1 /* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
2 Copyright (C) 2022-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <isa-level.h>
20
21 #if ISA_SHOULD_BUILD (4)
22
23 /* Use evex-masked stores for small sizes. Turned off at the
24 moment. */
25 # define USE_EVEX_MASKED_STORE 0
26
27
28 # include <sysdep.h>
29 # ifndef VEC_SIZE
30 # include "x86-evex256-vecs.h"
31 # endif
32
33
34 # ifndef STRNCPY
35 # define STRNCPY __strncpy_evex
36 # endif
37
38 # ifdef USE_AS_WCSCPY
39 # define VMOVU_MASK vmovdqu32
40 # define VPCMPEQ vpcmpeqd
41 # define VPMIN vpminud
42 # define VPTESTN vptestnmd
43 # define VPTEST vptestmd
44 # define CHAR_SIZE 4
45
46 # define REP_MOVS rep movsd
47 # define REP_STOS rep stosl
48
49 # define USE_WIDE_CHAR
50
51 # else
52 # define VMOVU_MASK vmovdqu8
53 # define VPCMPEQ vpcmpeqb
54 # define VPMIN vpminub
55 # define VPTESTN vptestnmb
56 # define VPTEST vptestmb
57 # define CHAR_SIZE 1
58
59 # define REP_MOVS rep movsb
60 # define REP_STOS rep stosb
61 # endif
62
63 # include "strncpy-or-cat-overflow-def.h"
64
65 # define PAGE_SIZE 4096
66 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
67
68 # include "reg-macros.h"
69
70
71 # define VZERO VMM(7)
72 # define VZERO_256 VMM_256(7)
73 # define VZERO_128 VMM_128(7)
74
75 # if VEC_SIZE == 64
76 # define VZERO_HALF VZERO_256
77 # else
78 # define VZERO_HALF VZERO_128
79 # endif
80
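/* Rough C-level sketch of what the {wcs|wcp|str|stp}ncpy variants built
   from this file compute (illustrative only; the code below works one
   VEC at a time, special-cases page crosses, and zero-fills with
   possibly-overlapping vector stores).  CHAR is an informal stand-in
   for char or wchar_t depending on the variant:

	CHAR *
	sketch_stpncpy (CHAR *dst, const CHAR *src, size_t n)
	{
	  size_t i = 0;
	  // Copy at most n CHARs, stopping at the first zero CHAR
	  // (which the fill loop below then writes).
	  for (; i < n && src[i] != 0; i++)
	    dst[i] = src[i];
	  CHAR *end = dst + i;
	  // Zero-fill whatever remains of the n-CHAR destination.
	  for (; i < n; i++)
	    dst[i] = 0;
	  // stpncpy/wcpncpy return end; strncpy/wcsncpy return dst.
	  return end;
	}
   */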
81 .section SECTION(.text), "ax", @progbits
82 ENTRY(STRNCPY)
83 # ifdef __ILP32__
84 /* Clear the upper 32 bits. */
85 movl %edx, %edx
86 # endif
87         /* Filter out zero-length strings and very long strings.  Zero-
88            length strings just return; very long strings are handled by
89            running rep stos{b|l} to zero out the destination (which will
90            almost certainly segfault), and if that succeeds then just
91            calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
92 # ifdef USE_AS_WCSCPY
93 decq %rdx
94 movq %rdx, %rax
95 /* 56 is end of max supported address space. */
96 shr $56, %rax
97 jnz L(zero_len)
98 # else
99 decq %rdx
100         /* If the branch below ever needs to become `jb`, replace the
101            `dec` with `sub` (NB: `dec` does not set CF).  */
102 jl L(zero_len)
103 # endif
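        /* Worked example of the filtering above (an illustration, byte
           variant): rdx == 0 decrements to -1 and sets SF, so `jl` takes
           L(zero_len); rdx == 1 decrements to 0 with SF clear and falls
           through.  For the wide-char variant the `shr $56` additionally
           routes absurdly large lengths to L(zero_len), which forwards
           them to L(best_effort_strncpy).  */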
104
105 vpxorq %VZERO_128, %VZERO_128, %VZERO_128
106 movl %esi, %eax
107 andl $(PAGE_SIZE - 1), %eax
108 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
109 ja L(page_cross)
110
111 L(page_cross_continue):
112 VMOVU (%rsi), %VMM(0)
113 VPTESTN %VMM(0), %VMM(0), %k0
114 KMOV %k0, %VRCX
115
116         /* If not STPCPY, the return value is just dst, so save it ahead of time.  */
117 # ifndef USE_AS_STPCPY
118 movq %rdi, %rax
119 # endif
120
121
122 cmpq $(CHAR_PER_VEC), %rdx
123
124         /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
125            <= CHAR_PER_VEC with masked instructions (which have
126            potential for dramatically bad perf if dst splits a page and
127            is not in the TLB).  */
128 # if USE_EVEX_MASKED_STORE
129 /* `jae` because length rdx is now length - 1. */
130 jae L(more_1x_vec)
131
132         /* If there were multiple zero-CHAR matches in the first VEC,
133            VRCX will be overset, but that's fine since any oversets were
134            at zero positions anyway.  */
135
136 # ifdef USE_AS_STPCPY
137 tzcnt %VRCX, %VRAX
138 cmpl %eax, %edx
139 cmovb %edx, %eax
140 # ifdef USE_AS_WCSCPY
141 adcl $0, %eax
142 leaq (%rdi, %rax, CHAR_SIZE), %rax
143 # else
144 adcq %rdi, %rax
145 # endif
146 # endif
147 dec %VRCX
148
149         /* Zero out all non-zero CHARs after the first zero match.  */
150 KMOV %VRCX, %k1
151
152         /* Use VZERO as the destination so this can be reused for
153            L(zfill_less_vec) (which, if jumped to by subsequent logic,
154            will have zeroed out VZERO).  */
155 VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
156 L(zfill_less_vec):
157 /* Get mask for what we need to set. */
158 incl %edx
159 mov $-1, %VRCX
160 bzhi %VRDX, %VRCX, %VRCX
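        /* BZHI keeps bits [rdx-1:0] of the all-ones VRCX and clears the
           rest, so after the `incl` above the k1 mask below selects
           exactly the original `len` CHARs starting at rdi.  */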
161 KMOV %VRCX, %k1
162 VMOVU_MASK %VZERO, (%rdi){%k1}
163 ret
164
165 .p2align 4,, 4
166 L(zero_len):
167 cmpq $-1, %rdx
168 jne L(best_effort_strncpy)
169 movq %rdi, %rax
170 ret
171
172 .p2align 4,, 8
173 L(more_1x_vec):
174 # else
175 /* `jb` because length rdx is now length - 1. */
176 jb L(less_1x_vec)
177 # endif
178
179
180         /* This store may write more than is needed, but that's fine
181            because we still need to zero-fill.  */
182 VMOVU %VMM(0), (%rdi)
183
184
185 /* Length must be >= CHAR_PER_VEC so match here means we must
186 zero-fill. */
187 test %VRCX, %VRCX
188 jnz L(zfill)
189
190
191         /* We are going to align rsi here, so we will need to re-adjust
192            rdi/rdx afterwards.  NB: We filtered out huge lengths so
193            rsi + rdx * CHAR_SIZE cannot overflow.  */
194 leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
195 subq %rsi, %rdi
196 andq $-(VEC_SIZE), %rsi
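        /* rdi now holds dst - src and rdx an end-based pointer into the
           source; both are converted back to usable values at
           L(loop_last_4x_vec) once rsi has been aligned here or advanced
           by the main copy loop.  */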
197
198 L(loop_last_4x_vec):
199 addq %rsi, %rdi
200 subq %rsi, %rdx
201 # ifdef USE_AS_WCSCPY
202 shrq $2, %rdx
203 # endif
204
205 VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
206 VPTESTN %VMM(1), %VMM(1), %k0
207 KMOV %k0, %VRCX
208
209 /* -1 because of the `dec %rdx` earlier. */
210 cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
211 ja L(more_2x_vec)
212
213 L(last_2x_vec):
214         /* This will need to be computed no matter what.  We do it
215            ahead of time for CHAR_PER_VEC == 64 because we can't adjust
216            the value of `tzcnt` with a shift.  */
217 # if CHAR_PER_VEC == 64
218 tzcntq %rcx, %rcx
219 # endif
220
221 cmpl $(CHAR_PER_VEC), %edx
222 jb L(ret_vec_x1_len)
223
224 /* Separate logic for CHAR_PER_VEC == 64 because we already did
225 `tzcnt` on VRCX. */
226 # if CHAR_PER_VEC == 64
227 /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
228 cmpb $CHAR_PER_VEC, %cl
229 jnz L(ret_vec_x1_no_bsf)
230 # else
231 test %VRCX, %VRCX
232 jnz L(ret_vec_x1)
233 # endif
234
235
236
237 VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
238 VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
239 KMOV %k0, %VRCX
240
241 # if CHAR_PER_VEC < 64
242         /* This essentially adds CHAR_PER_VEC to the computed result.  */
243 shlq $CHAR_PER_VEC, %rcx
244 # else
245 tzcntq %rcx, %rcx
246 addl $CHAR_PER_VEC, %ecx
247 # endif
248
249 .p2align 4,, 4
250 L(ret_vec_x1_len):
251 /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
252 already been done. */
253 # if CHAR_PER_VEC < 64
254 tzcntq %rcx, %rcx
255 # endif
256 cmpl %ecx, %edx
257 jbe L(ret_vec_x1_len_no_zfill)
258 /* Fall through (expectation) is copy len < buffer len. */
259 VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
260 L(ret_vec_x1_len_no_zfill_mov):
261 movl %ecx, %edx
262 # ifdef USE_AS_STPCPY
263 /* clear flags. */
264 xorl %ecx, %ecx
265 # endif
266 L(ret_vec_x1_len_no_zfill):
267 VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
268 VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
269 # ifdef USE_AS_STPCPY
270 # ifdef USE_AS_WCSCPY
271 adcq $0, %rdx
272 leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
273 # else
274 leal (VEC_SIZE)(%rdx), %eax
275 adcq %rdi, %rax
276 # endif
277 # endif
278 ret
279
280
281 .p2align 4,, 10
282 L(ret_vec_x1):
283 bsf %VRCX, %VRCX
284 L(ret_vec_x1_no_bsf):
285 VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
286 subl %ecx, %edx
287 cmpl $CHAR_PER_VEC, %edx
288 jb L(ret_vec_x1_len_no_zfill_mov)
289 /* Fall through (expectation) is copy len < buffer len. */
290 VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
291 VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
292 # ifdef USE_AS_STPCPY
293 leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
294 # endif
295 ret
296
297 .p2align 4,, 8
298 L(last_4x_vec):
299         /* Separate logic for CHAR_PER_VEC == 64 because we can get the
300            effect of `andl $(CHAR_PER_VEC * 4 - 1), %edx` with less code
301            size by just using `movzbl`.  */
302 # if CHAR_PER_VEC == 64
303 movzbl %dl, %edx
304 # else
305 andl $(CHAR_PER_VEC * 4 - 1), %edx
306 # endif
307 VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
308 VPTESTN %VMM(1), %VMM(1), %k0
309 KMOV %k0, %VRCX
310 subq $-(VEC_SIZE * 4), %rsi
311 subq $-(VEC_SIZE * 4), %rdi
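        /* NB: `subq $-(VEC_SIZE * 4)` rather than `addq $(VEC_SIZE * 4)`:
           for VEC_SIZE == 32 the constant -128 fits in a sign-extended
           imm8 while +128 does not, saving code size.  */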
312 cmpl $(CHAR_PER_VEC * 2 - 1), %edx
313 jbe L(last_2x_vec)
314 .p2align 4,, 8
315 L(more_2x_vec):
316 VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
317 test %VRCX, %VRCX
318 /* Must fill at least 2x VEC. */
319 jnz L(zfill_vec1)
320
321 VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
322 VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
323 VPTESTN %VMM(2), %VMM(2), %k0
324 KMOV %k0, %VRCX
325 test %VRCX, %VRCX
326 /* Must fill at least 1x VEC. */
327 jnz L(zfill_vec2)
328
329 VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
330 VPTESTN %VMM(3), %VMM(3), %k0
331 KMOV %k0, %VRCX
332
333         /* Check if len is more than 4x VEC.  -1 because rdx is len - 1.  */
334 cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
335 ja L(more_4x_vec)
336
337 subl $(CHAR_PER_VEC * 3), %edx
338 jb L(ret_vec_x3_len)
339
340 test %VRCX, %VRCX
341 jnz L(ret_vec_x3)
342
343 VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
344 VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
345 KMOV %k0, %VRCX
346 tzcnt %VRCX, %VRCX
347 cmpl %ecx, %edx
348 jbe L(ret_vec_x4_len_no_zfill)
349 /* Fall through (expectation) is copy len < buffer len. */
350 VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
351 movl %ecx, %edx
352 L(ret_vec_x4_len_no_zfill):
353 VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
354 VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
355 # ifdef USE_AS_STPCPY
356 # ifdef USE_AS_WCSCPY
357 adcq $0, %rdx
358 leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
359 # else
360 leal (VEC_SIZE * 4 + 0)(%rdx), %eax
361 adcq %rdi, %rax
362 # endif
363 # endif
364 ret
365
366
367 L(ret_vec_x3_len):
368 addl $(CHAR_PER_VEC * 1), %edx
369 tzcnt %VRCX, %VRCX
370 cmpl %ecx, %edx
371 jbe L(ret_vec_x3_len_no_zfill)
372 /* Fall through (expectation) is copy len < buffer len. */
373 VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
374 L(ret_vec_x3_len_no_zfill_mov):
375 movl %ecx, %edx
376 # ifdef USE_AS_STPCPY
377 /* clear flags. */
378 xorl %ecx, %ecx
379 # endif
380 .p2align 4,, 4
381 L(ret_vec_x3_len_no_zfill):
382 VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
383 VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
384 # ifdef USE_AS_STPCPY
385 # ifdef USE_AS_WCSCPY
386 adcq $0, %rdx
387 leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
388 # else
389 leal (VEC_SIZE * 3 + 0)(%rdx), %eax
390 adcq %rdi, %rax
391 # endif
392 # endif
393 ret
394
395
396 .p2align 4,, 8
397 L(ret_vec_x3):
398 bsf %VRCX, %VRCX
399 VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
400 subl %ecx, %edx
401 jl L(ret_vec_x3_len_no_zfill_mov)
402 VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
403 VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
404 # ifdef USE_AS_STPCPY
405 leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
406 # endif
407 ret
408
409 .p2align 4,, 8
410 L(more_4x_vec):
411 VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
412 test %VRCX, %VRCX
413 jnz L(zfill_vec3)
414
415 VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
416 VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
417 VPTESTN %VMM(4), %VMM(4), %k0
418 KMOV %k0, %VRCX
419 test %VRCX, %VRCX
420 jnz L(zfill_vec4)
421
422 /* Recheck length before aligning. */
423 cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
424 jbe L(last_4x_vec)
425
426 /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */
427 # ifdef USE_AS_WCSCPY
428 leaq (%rsi, %rdx, CHAR_SIZE), %rdx
429 # else
430 addq %rsi, %rdx
431 # endif
432 subq %rsi, %rdi
433 subq $-(VEC_SIZE * 5), %rsi
434 andq $(VEC_SIZE * -4), %rsi
435
436
437 /* Load first half of the loop before entry. */
438 VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
439 VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
440 VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
441 VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
442
443 VPMIN %VMM(0), %VMM(1), %VMM(4)
444 VPMIN %VMM(2), %VMM(3), %VMM(6)
445 VPTESTN %VMM(4), %VMM(4), %k2
446 VPTESTN %VMM(6), %VMM(6), %k4
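        /* An unsigned minimum is zero in a lane iff either input lane is
           zero, so testing the two VPMIN results with a single KORTEST of
           k2/k4 detects a zero CHAR anywhere in the four VECs just
           loaded.  */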
447
448
449 /* Offset rsi by VEC_SIZE so that we can jump to
450 L(loop_last_4x_vec). */
451 addq $-(VEC_SIZE), %rsi
452 KORTEST %k2, %k4
453 jnz L(loop_4x_done)
454
455 /* Store loop end in r9. */
456 leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
457
458 .p2align 4,, 11
459 L(loop_4x_vec):
460 VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
461 VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
462 VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
463 VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
464
465 subq $(VEC_SIZE * -4), %rsi
466 cmpq %rsi, %r9
467 jbe L(loop_last_4x_vec)
468
469 VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
470 VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
471 VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
472 VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
473
474 VPMIN %VMM(0), %VMM(1), %VMM(4)
475 VPMIN %VMM(2), %VMM(3), %VMM(6)
476 VPTESTN %VMM(4), %VMM(4), %k2
477 VPTESTN %VMM(6), %VMM(6), %k4
478 KORTEST %k2, %k4
479 jz L(loop_4x_vec)
480
481 L(loop_4x_done):
482 /* Restore rdx (length). */
483 subq %rsi, %rdx
484 # ifdef USE_AS_WCSCPY
485 shrq $2, %rdx
486 # endif
487 VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
488 /* Restore rdi (dst). */
489 addq %rsi, %rdi
490 VPTESTN %VMM(0), %VMM(0), %k0
491 KMOV %k0, %VRCX
492 test %VRCX, %VRCX
493 jnz L(zfill_vec1)
494
495 VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
496 KMOV %k2, %VRCX
497 test %VRCX, %VRCX
498 jnz L(zfill_vec2)
499
500 VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
501 VPTESTN %VMM(2), %VMM(2), %k0
502 KMOV %k0, %VRCX
503 test %VRCX, %VRCX
504 jnz L(zfill_vec3)
505
506 VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
507 KMOV %k4, %VRCX
508         /* Fall through to L(zfill_vec4) and zero-fill the remainder.  */
509
510 .p2align 4,, 4
511 L(zfill_vec4):
512 subq $(VEC_SIZE * -2), %rdi
513 addq $(CHAR_PER_VEC * -2), %rdx
514 L(zfill_vec2):
515 subq $(VEC_SIZE * -2), %rdi
516 addq $(CHAR_PER_VEC * -1), %rdx
517 L(zfill):
518 /* VRCX must be non-zero. */
519 bsf %VRCX, %VRCX
520
521 /* Adjust length / dst for zfill. */
522 subq %rcx, %rdx
523 # ifdef USE_AS_WCSCPY
524 leaq (%rdi, %rcx, CHAR_SIZE), %rdi
525 # else
526 addq %rcx, %rdi
527 # endif
528 # ifdef USE_AS_STPCPY
529 movq %rdi, %rax
530 # endif
531 L(zfill_from_page_cross):
532
533         /* From here on out it's just memset(rdi, 0, rdx).  */
534 cmpq $CHAR_PER_VEC, %rdx
535 jb L(zfill_less_vec)
536
537 L(zfill_more_1x_vec):
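        /* Zero-fill up to 2x VEC branchlessly: one store at the start of
           the region and one ending at its last CHAR; the two stores may
           overlap.  */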
538 VMOVU %VZERO, (%rdi)
539 VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
540 cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
541 ja L(zfill_more_2x_vec)
542 L(zfill_done0):
543 ret
544
545 /* Coming from vec1/vec2 we must be able to zfill at least 2x
546 VEC. */
547 .p2align 4,, 8
548 L(zfill_vec3):
549 subq $(VEC_SIZE * -2), %rdi
550 addq $(CHAR_PER_VEC * -2), %rdx
551 .p2align 4,, 2
552 L(zfill_vec1):
553 bsfq %rcx, %rcx
554 /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
555 */
556 leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
557 subq %rcx, %rdx
558 # ifdef USE_AS_STPCPY
559 movq %rdi, %rax
560 # endif
561
562
563 VMOVU %VZERO, (%rdi)
564 VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
565 cmpq $(CHAR_PER_VEC * 2), %rdx
566 jb L(zfill_done0)
567 L(zfill_more_2x_vec):
568 VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
569 VMOVU %VZERO, (VEC_SIZE)(%rdi)
570 subq $(CHAR_PER_VEC * 4 - 1), %rdx
571 jbe L(zfill_done)
572
573 # ifdef USE_AS_WCSCPY
574 leaq (%rdi, %rdx, CHAR_SIZE), %rdx
575 # else
576 addq %rdi, %rdx
577 # endif
578
579 VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
580 VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)
581
582
583 VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
584 VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
585
586 subq $-(VEC_SIZE * 4), %rdi
587 cmpq %rdi, %rdx
588 jbe L(zfill_done)
589
590 /* Align rdi and zfill loop. */
591 andq $-(VEC_SIZE), %rdi
592 .p2align 4,, 12
593 L(zfill_loop_4x_vec):
594 VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
595 VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
596 VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
597 VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
598 subq $-(VEC_SIZE * 4), %rdi
599 cmpq %rdi, %rdx
600 ja L(zfill_loop_4x_vec)
601 L(zfill_done):
602 ret
603
604
605         /* Less than 1x VEC case if we are not using evex masked stores.  */
606 # if !USE_EVEX_MASKED_STORE
607 .p2align 4,, 8
608 L(copy_1x):
609 /* Special case for copy 1x. It can be handled quickly and many
610 buffer sizes have convenient alignment. */
611 VMOVU %VMM(0), (%rdi)
612 /* If no zeros then we are done. */
613 testl %ecx, %ecx
614 jz L(ret_1x_1x)
615
616         /* Need to zfill; note that we know length <= CHAR_PER_VEC so we
617            only handle the small case here.  */
618 bsf %VRCX, %VRCX
619 L(zfill_less_vec_no_bsf):
620 /* Adjust length / dst then just zfill less_vec. */
621 subq %rcx, %rdx
622 # ifdef USE_AS_WCSCPY
623 leaq (%rdi, %rcx, CHAR_SIZE), %rdi
624 # else
625 addq %rcx, %rdi
626 # endif
627 # ifdef USE_AS_STPCPY
628 movq %rdi, %rax
629 # endif
630
631 L(zfill_less_vec):
632 cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
633 jb L(zfill_less_half)
634
635 VMOVU %VZERO_HALF, (%rdi)
636 VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
637 ret
638 # ifdef USE_AS_STPCPY
639 L(ret_1x_1x):
640 leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
641 ret
642 # endif
643
644
645 # if VEC_SIZE == 64
646 .p2align 4,, 4
647 L(copy_32_63):
648 /* Overfill to avoid branches. */
649 VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
650 VMOVU %VMM_256(0), (%rdi)
651 VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
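        /* This is the copy analogue of the overlapping-store trick: the
           full front store plus a 32-byte store ending at the last of the
           n CHARs cover this whole size class without a branch.  */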
652
653         /* We are taking advantage of the fact that to get here we must
654            be writing the null-term at (%rdi, %rcx), so we have a byte of
655            leeway for overwriting.  */
656 cmpl %ecx, %edx
657 ja L(zfill_less_vec_no_bsf)
658 # ifndef USE_AS_STPCPY
659 L(ret_1x_1x):
660 # else
661 # ifdef USE_AS_WCSCPY
662 adcq $0, %rdx
663 leaq (%rdi, %rdx, CHAR_SIZE), %rax
664 # else
665 movl %edx, %eax
666 adcq %rdi, %rax
667 # endif
668 # endif
669 ret
670 # endif
671
672 .p2align 4,, 4
673 L(copy_16_31):
674 /* Overfill to avoid branches. */
675 vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
676 VMOVU %VMM_128(0), (%rdi)
677 vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
678 cmpl %ecx, %edx
679
680         /* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
681            we have a larger copy block for 32-63 so this just falls
682            through to zfill 16-31.  If VEC_SIZE == 32 then we check for
683            full zfill of less than 1x VEC.  */
684 # if VEC_SIZE == 64
685 jbe L(ret_16_31)
686 subl %ecx, %edx
687 # ifdef USE_AS_WCSCPY
688 leaq (%rdi, %rcx, CHAR_SIZE), %rdi
689 # else
690 addq %rcx, %rdi
691 # endif
692 # ifdef USE_AS_STPCPY
693 movq %rdi, %rax
694 # endif
695 L(zfill_less_half):
696 L(zfill_less_32):
697 cmpl $(16 / CHAR_SIZE), %edx
698 jb L(zfill_less_16)
699 VMOVU %VZERO_128, (%rdi)
700 VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
701 # ifdef USE_AS_STPCPY
702 ret
703 # endif
704 L(ret_16_31):
705 # ifdef USE_AS_STPCPY
706 # ifdef USE_AS_WCSCPY
707 adcq $0, %rdx
708 leaq (%rdi, %rdx, CHAR_SIZE), %rax
709 # else
710 movl %edx, %eax
711 adcq %rdi, %rax
712 # endif
713 # endif
714 ret
715 # else
716 /* VEC_SIZE == 32 begins. */
717 ja L(zfill_less_vec_no_bsf)
718 # ifndef USE_AS_STPCPY
719 L(ret_1x_1x):
720 # else
721 # ifdef USE_AS_WCSCPY
722 adcq $0, %rdx
723 leaq (%rdi, %rdx, CHAR_SIZE), %rax
724 # else
725 movl %edx, %eax
726 adcq %rdi, %rax
727 # endif
728 # endif
729 ret
730 # endif
731
732
733 .p2align 4,, 4
734 L(copy_8_15):
735 /* Overfill to avoid branches. */
736 movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
737 vmovq %VMM_128(0), (%rdi)
738 movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
739 cmpl %ecx, %edx
740 jbe L(ret_8_15)
741 subl %ecx, %edx
742 # ifdef USE_AS_WCSCPY
743 leaq (%rdi, %rcx, CHAR_SIZE), %rdi
744 # else
745 addq %rcx, %rdi
746 # endif
747 # ifdef USE_AS_STPCPY
748 movq %rdi, %rax
749 # endif
750 .p2align 4,, 8
751 # if VEC_SIZE == 32
752 L(zfill_less_half):
753 # endif
754 L(zfill_less_16):
755 xorl %ecx, %ecx
756 cmpl $(8 / CHAR_SIZE), %edx
757 jb L(zfill_less_8)
758 movq %rcx, (%rdi)
759 movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
760 # ifndef USE_AS_STPCPY
761 L(ret_8_15):
762 # endif
763 ret
764
765 .p2align 4,, 8
766 L(less_1x_vec):
767 je L(copy_1x)
768
769         /* We will need the `tzcnt` result for all other copy sizes.  */
770 tzcnt %VRCX, %VRCX
771 # if VEC_SIZE == 64
772 cmpl $(32 / CHAR_SIZE), %edx
773 jae L(copy_32_63)
774 # endif
775
776 cmpl $(16 / CHAR_SIZE), %edx
777 jae L(copy_16_31)
778
779 cmpl $(8 / CHAR_SIZE), %edx
780 jae L(copy_8_15)
781 # ifdef USE_AS_WCSCPY
782 testl %ecx, %ecx
783 jz L(zfill_less_8_set_ret)
784
785 movl (%rsi, %rdx, CHAR_SIZE), %esi
786 vmovd %VMM_128(0), (%rdi)
787 movl %esi, (%rdi, %rdx, CHAR_SIZE)
788 # ifdef USE_AS_STPCPY
789 cmpl %ecx, %edx
790 L(ret_8_15):
791 adcq $0, %rdx
792 leaq (%rdi, %rdx, CHAR_SIZE), %rax
793 # endif
794 ret
795 L(zfill_less_8_set_ret):
796 xorl %ecx, %ecx
797 # ifdef USE_AS_STPCPY
798 movq %rdi, %rax
799 # endif
800 L(zfill_less_8):
801 movl %ecx, (%rdi)
802 movl %ecx, (%rdi, %rdx, CHAR_SIZE)
803 ret
804 # else
805 cmpl $3, %edx
806 jb L(copy_0_3)
807 /* Overfill to avoid branches. */
808 movl -3(%rsi, %rdx), %esi
809 vmovd %VMM_128(0), (%rdi)
810 movl %esi, -3(%rdi, %rdx)
811 cmpl %ecx, %edx
812 jbe L(ret_4_7)
813 subq %rcx, %rdx
814 addq %rcx, %rdi
815 # ifdef USE_AS_STPCPY
816 movq %rdi, %rax
817 # endif
818 xorl %ecx, %ecx
819 .p2align 4,, 8
820 L(zfill_less_8):
821 cmpl $3, %edx
822 jb L(zfill_less_3)
823 movl %ecx, (%rdi)
824 movl %ecx, -3(%rdi, %rdx)
825 # ifdef USE_AS_STPCPY
826 ret
827 # endif
828
829 L(ret_4_7):
830 # ifdef USE_AS_STPCPY
831 L(ret_8_15):
832 movl %edx, %eax
833 adcq %rdi, %rax
834 # endif
835 ret
836
837 .p2align 4,, 4
838 L(zfill_less_3):
839 testl %edx, %edx
840 jz L(zfill_1)
841 movw %cx, (%rdi)
842 L(zfill_1):
843 movb %cl, (%rdi, %rdx)
844 ret
845
846 .p2align 4,, 8
847 L(copy_0_3):
848 vmovd %VMM_128(0), %r8d
849 testl %edx, %edx
850 jz L(copy_1)
851 movw %r8w, (%rdi)
852 cmpl %ecx, %edx
853 ja L(zfill_from_1)
854 movzbl (%rsi, %rdx), %r8d
855 # ifdef USE_AS_STPCPY
856 movl %edx, %eax
857 adcq %rdi, %rax
858 movb %r8b, (%rdi, %rdx)
859 ret
860 # endif
861
862 L(copy_1):
863 # ifdef USE_AS_STPCPY
864 movl %edx, %eax
865 cmpl %ecx, %edx
866 adcq %rdi, %rax
867 # endif
868 # ifdef USE_AS_WCSCPY
869 vmovd %VMM_128(0), (%rdi)
870 # else
871 movb %r8b, (%rdi, %rdx)
872 # endif
873 ret
874 # endif
875
876
877 # ifndef USE_AS_WCSCPY
878 .p2align 4,, 8
879 L(zfill_from_1):
880 # ifdef USE_AS_STPCPY
881 leaq (%rdi, %rcx), %rax
882 # endif
883 movw $0, -1(%rdi, %rdx)
884 ret
885 # endif
886
887 .p2align 4,, 4
888 L(zero_len):
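        /* rdx is len - 1 here, so `incq` restores the original length: a
           zero length just returns dst, while the absurdly large lengths
           that also land here fall through to L(best_effort_strncpy).  */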
889 incq %rdx
890 jne L(best_effort_strncpy)
891 movq %rdi, %rax
892 ret
893 # endif
894
895
896 .p2align 4,, 4
897 .p2align 6,, 8
898 L(page_cross):
899 movq %rsi, %rax
900 andq $(VEC_SIZE * -1), %rax
901 VPCMPEQ (%rax), %VZERO, %k0
902 KMOV %k0, %VRCX
903 # ifdef USE_AS_WCSCPY
904 movl %esi, %r8d
905 shrl $2, %r8d
906 andl $(CHAR_PER_VEC - 1), %r8d
907 shrx %VR8, %VRCX, %VRCX
908 # else
909 shrx %VRSI, %VRCX, %VRCX
910 # endif
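        /* The compare above covered the whole VEC containing src, so
           shift the zero-CHAR mask right by src's offset within that VEC;
           the remaining bits correspond to CHARs at or after src.  */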
911
912         /* Compute the number of bytes we have checked.  */
913 subl %esi, %eax
914 andl $(VEC_SIZE - 1), %eax
915 # ifdef USE_AS_WCSCPY
916 shrl $2, %eax
917 # endif
918
919         /* If rax > rdx then the entire copy finishes within the region
920            we just checked, i.e. before the end of the page.  */
921 cmpq %rax, %rdx
922 jb L(page_cross_small)
923
924
925         /* If rcx is zero (no zero CHAR found) then continue on the normal path.  */
926 test %VRCX, %VRCX
927 jz L(page_cross_continue)
928
929 /* We found zero-CHAR so need to copy then zfill (we know we
930 didn't cover all of length here). */
931 bsf %VRCX, %VRCX
932 L(movsb_and_zfill):
933 incl %ecx
934 subq %rcx, %rdx
935 # ifdef USE_AS_STPCPY
936 leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
937 # else
938 movq %rdi, %rax
939 # endif
940
941 REP_MOVS
942 # ifdef USE_AS_WCSCPY
943 movl $0, (%rdi)
944 # else
945 movb $0, (%rdi)
946 # endif
947 jmp L(zfill_from_page_cross)
948
949 L(page_cross_small):
950 tzcnt %VRCX, %VRCX
951 cmpl %ecx, %edx
952 jbe L(page_cross_copy_only)
953
954 /* Do a zfill of the tail before copying. */
955 movq %rdi, %r9
956 xorl %eax, %eax
957
958 movl %ecx, %r8d
959
960 subl %ecx, %edx
961 leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
962 movl %edx, %ecx
963 REP_STOS
964 movq %r9, %rdi
965 movl %r8d, %edx
966 L(page_cross_copy_only):
967 leal 1(%rdx), %ecx
968 # ifdef USE_AS_STPCPY
969 # ifdef USE_AS_WCSCPY
970 adcl $0, %edx
971 leaq (%rdi, %rdx, CHAR_SIZE), %rax
972 # else
973 movl %edx, %eax
974 adcq %rdi, %rax
975 # endif
976 # else
977 movq %rdi, %rax
978 # endif
979 REP_MOVS
980 ret
981
982
983 L(best_effort_strncpy):
984 movq %rdx, %rcx
985 xorl %eax, %eax
986 movq %rdi, %r8
987         /* The length is >= 2^63.  We very much expect the rep stos to
988            segfault.  If that doesn't happen then just strcpy to finish.
989            */
990 REP_STOS
991 movq %r8, %rdi
992 jmp OVERFLOW_STRCPY
993 END(STRNCPY)
994 #endif