/* memcpy with AVX
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"
#ifndef MEMCPY
# define MEMCPY		__memcpy_avx_unaligned
# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
#endif

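/* memcpy/mempcpy/memmove/bcopy variant built on unaligned AVX loads and
   stores: %rdi is the destination, %rsi the source and %rdx the length.
   The value to return (the destination, or destination + length for
   mempcpy) is computed into %rax on entry.  The _chk entry point aborts
   via __chk_fail when the object size passed in %rcx is smaller than
   the length.  */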
	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

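/* Dispatch on the length: 256 bytes and up take the block-copy paths at
   L(256bytesormore); anything smaller is handled by overlapping head
   and tail copies selected by the size checks below.  */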
ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$16, %dl
	jb	L(less_16bytes)
	cmp	$128, %dl
	jb	L(less_128bytes)
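/* 128 to 255 bytes: load eight 16-byte chunks from the start and eight
   from the end of the source, then store all of them; the two halves
   overlap in the middle, so the whole range is covered.  */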
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	0x40(%rsi), %xmm4
	vmovdqu	0x50(%rsi), %xmm5
	vmovdqu	0x60(%rsi), %xmm6
	vmovdqu	0x70(%rsi), %xmm7
	vmovdqu	-0x80(%rcx), %xmm8
	vmovdqu	-0x70(%rcx), %xmm9
	vmovdqu	-0x60(%rcx), %xmm10
	vmovdqu	-0x50(%rcx), %xmm11
	vmovdqu	-0x40(%rcx), %xmm12
	vmovdqu	-0x30(%rcx), %xmm13
	vmovdqu	-0x20(%rcx), %xmm14
	vmovdqu	-0x10(%rcx), %xmm15
	lea	(%rdi, %rdx), %rdx
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, 0x40(%rdi)
	vmovdqu	%xmm5, 0x50(%rdi)
	vmovdqu	%xmm6, 0x60(%rdi)
	vmovdqu	%xmm7, 0x70(%rdi)
	vmovdqu	%xmm8, -0x80(%rdx)
	vmovdqu	%xmm9, -0x70(%rdx)
	vmovdqu	%xmm10, -0x60(%rdx)
	vmovdqu	%xmm11, -0x50(%rdx)
	vmovdqu	%xmm12, -0x40(%rdx)
	vmovdqu	%xmm13, -0x30(%rdx)
	vmovdqu	%xmm14, -0x20(%rdx)
	vmovdqu	%xmm15, -0x10(%rdx)
	ret
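/* 64 to 127 bytes: four 16-byte chunks from each end.  */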
	.p2align 4
L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	lea	(%rdi, %rdx), %rdx
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	-0x40(%rcx), %xmm4
	vmovdqu	-0x30(%rcx), %xmm5
	vmovdqu	-0x20(%rcx), %xmm6
	vmovdqu	-0x10(%rcx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, -0x40(%rdx)
	vmovdqu	%xmm5, -0x30(%rdx)
	vmovdqu	%xmm6, -0x20(%rdx)
	vmovdqu	%xmm7, -0x10(%rdx)
	ret

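/* 32 to 63 bytes: two 16-byte chunks from each end.  */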
	.p2align 4
L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %xmm0
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	-0x20(%rsi, %rdx), %xmm6
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm6, -0x20(%rdi, %rdx)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

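/* 16 to 31 bytes: one 16-byte chunk from each end.  */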
	.p2align 4
L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

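/* Less than 16 bytes: two overlapping integer moves of 8, 4 or 2 bytes
   picked by the length; a single byte or an empty copy falls out at the
   bottom.  */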
	.p2align 4
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	-0x08(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -0x08(%rdi, %rdx)
	ret

	.p2align 4
L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	-0x04(%rsi, %rdx), %ecx
	mov	(%rsi), %esi
	mov	%esi, (%rdi)
	mov	%ecx, -0x04(%rdi, %rdx)
	ret

L(less_4bytes):
	cmp	$1, %dl
	jbe	L(less_2bytes)
	mov	-0x02(%rsi, %rdx), %cx
	mov	(%rsi), %si
	mov	%si, (%rdi)
	mov	%cx, -0x02(%rdi, %rdx)
	ret

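/* The flags from the cmp $1, %dl above are still live here: below
   means a zero-length copy, equal means exactly one byte.  */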
L(less_2bytes):
	jb	L(less_0bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_0bytes):
	ret

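/* At least 256 bytes.  For memmove, switch to the backward copier when
   the destination overlaps the end of the source.  Otherwise pick a
   strategy by size: an aligned 128-byte store loop up to 2K, then
   rep movsb or non-temporal stores beyond that.  */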
	.p2align 4
L(256bytesormore):
#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx
	cmp	%rdx, %rcx
	jc	L(copy_backward)
#endif
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	mov	%rax, %r8
	lea	(%rsi, %rdx), %rcx
	mov	%rdi, %r10
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	mov	$0x80, %rax
	and	$-32, %rdi
	add	$32, %rdi
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	mov	%rdi, %r11
	sub	%r10, %r11
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	sub	%r11, %rdx
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	add	%r11, %rsi
	sub	%eax, %edx
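/* 256 to 2047 bytes: the first 32 bytes (%ymm4) and the last 128 bytes
   (%xmm5-%xmm12) were saved above and the destination rounded up to a
   32-byte boundary; %r8 parks the return value while %rax holds the
   0x80 stride.  The loop moves 128 bytes per iteration with unaligned
   loads and aligned stores.  */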
L(gobble_128_loop):
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	add	%rax, %rsi
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	add	%rax, %rdi
	sub	%eax, %edx
	jae	L(gobble_128_loop)
	add	%eax, %edx
	add	%rdi, %rdx
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rdx)
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
	mov	%r8, %rax
	ret

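/* 2K and up: lengths below four times the shared cache size are copied
   with rep movsb, larger ones with the non-temporal loop below.  */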
	.p2align 4
L(gobble_data_movsb):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	cmp	%rcx, %rdx
	jae	L(gobble_big_data_fwd)
	mov	%rdx, %rcx
	rep	movsb
	ret

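/* Large forward copy that bypasses the cache: save the 32-byte head and
   128-byte tail as in the 2K path, align the destination to 32 bytes,
   then stream 128 bytes per iteration with prefetchnta hints and
   non-temporal stores, fenced before the ordinary trailing stores.  */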
	.p2align 4
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx
	vmovdqu	(%rsi), %ymm4
	vmovdqu	-0x80(%rsi, %rdx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	mov	%rdi, %r8
	and	$-32, %rdi
	add	$32, %rdi
	mov	%rdi, %r10
	sub	%r8, %r10
	sub	%r10, %rdx
	add	%r10, %rsi
	lea	(%rdi, %rdx), %rcx
	add	$-0x80, %rdx
L(gobble_mem_fwd_loop):
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	sub	$-0x80, %rsi
	vmovntdq %ymm0, (%rdi)
	vmovntdq %ymm1, 0x20(%rdi)
	vmovntdq %ymm2, 0x40(%rdi)
	vmovntdq %ymm3, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_fwd_loop)
	sfence
	vmovdqu	%ymm4, (%r8)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rcx)
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
	ret

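/* memmove only: the regions overlap with the destination above the
   source, so copy from the end toward the start.  The first 128 bytes
   of the source and its last 32 bytes are saved up front, the end of
   the destination is aligned down to 32 bytes, and the bulk moves in
   128-byte steps with aligned stores, or with non-temporal stores once
   the length exceeds four times the shared cache size.  */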
#ifdef USE_AS_MEMMOVE
	.p2align 4
L(copy_backward):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	add	%rdx, %rdi
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10
	mov	%rdi, %r11
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	and	$0x1f, %r11
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	xor	%r11, %rdi
	add	%rdx, %rsi
	vmovdqu	-0x20(%rsi), %ymm4
	sub	%r11, %rsi
	sub	%r11, %rdx
	cmp	%rcx, %rdx
	ja	L(gobble_big_data_bwd)
	add	$-0x80, %rdx
L(gobble_mem_bwd_llc):
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovdqa	%ymm0, -0x20(%rdi)
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret

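/* The same backward walk for copies past the cache bound: prefetchnta
   ahead of the loads, non-temporal stores, and an sfence before the
   saved head and tail are written back.  */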
	.p2align 4
L(gobble_big_data_bwd):
	add	$-0x80, %rdx
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovntdq %ymm0, -0x20(%rdi)
	vmovntdq %ymm1, -0x40(%rdi)
	vmovntdq %ymm2, -0x60(%rdi)
	vmovntdq %ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_loop)
	sfence
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret
#endif
END (MEMCPY)
#endif