crypto: x86/crc32c - eliminate jump table and excessive unrolling

author Eric Biggers <ebiggers@google.com>
Mon, 14 Oct 2024 04:24:47 +0000 (21:24 -0700)
committer Herbert Xu <herbert@gondor.apana.org.au>
Sat, 26 Oct 2024 06:41:59 +0000 (14:41 +0800)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S

index bbf860e90951d75b371b29b9c3cbeb4cc460fdbe..752812bc4991df2778c1b9a4639148c0f231e981 100644 (file)
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -7,6 +7,7 @@
   * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
   *
   * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
   *
   * Authors:
   *     Wajdi Feghali <wajdi.k.feghali@intel.com>
@@ -44,18 +45,9 @@
   */
  
  #include <linux/linkage.h>
-#include <asm/nospec-branch.h>
  
  ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
  
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
  # Define threshold below which buffers are considered "small" and routed to
  # regular CRC code that does not interleave the CRC instructions.
  #define SMALL_SIZE 200
@@ -64,139 +56,116 @@
  
  .text
  SYM_FUNC_START(crc_pcl)
-#define    bufp                rdi
-#define    bufp_dw     %edi
-#define    bufp_w      %di
-#define    bufp_b      %dil
-#define    bufptmp     %rcx
-#define    block_0     %rcx
-#define    block_1     %rdx
-#define    block_2     %r11
-#define    len         %esi
-#define    crc_init_arg %edx
-#define    tmp         %rbx
-#define    crc_init    %r8d
-#define    crc_init_q  %r8
-#define    crc1                %r9
-#define    crc2                %r10
-
-       pushq   %rbx
-       pushq   %rdi
-       pushq   %rsi
+#define    bufp                  %rdi
+#define    bufp_d        %edi
+#define    len           %esi
+#define    crc_init      %edx
+#define    crc_init_q    %rdx
+#define    n_misaligned          %ecx /* overlaps chunk_bytes! */
+#define    n_misaligned_q %rcx
+#define    chunk_bytes   %ecx /* overlaps n_misaligned! */
+#define    chunk_bytes_q  %rcx
+#define    crc1                  %r8
+#define    crc2                  %r9
  
-       ## Move crc_init for Linux to a different
-       mov     crc_init_arg, crc_init
-
-       mov     %bufp, bufptmp          # rdi = *buf
         cmp     $SMALL_SIZE, len
         jb      .Lsmall
  
         ################################################################
         ## 1) ALIGN:
         ################################################################
-       neg     %bufp
-       and     $7, %bufp               # calculate the unalignment amount of
+       mov     bufp_d, n_misaligned
+       neg     n_misaligned
+       and     $7, n_misaligned        # calculate the misalignment amount of
                                         # the address
-       je      .Lproc_block            # Skip if aligned
+       je      .Laligned               # Skip if aligned
  
+       # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+       # the remaining data to an 8-byte boundary.
  .Ldo_align:
-       #### Calculate CRC of unaligned bytes of the buffer (if any)
-       movq    (bufptmp), tmp          # load a quadward from the buffer
-       add     %bufp, bufptmp          # align buffer pointer for quadword
-                                       # processing
-       sub     bufp_dw, len            # update buffer length
+       movq    (bufp), %rax
+       add     n_misaligned_q, bufp
+       sub     n_misaligned, len
  .Lalign_loop:
-       crc32b  %bl, crc_init           # compute crc32 of 1-byte
-       shr     $8, tmp                 # get next byte
-       dec     %bufp
+       crc32b  %al, crc_init           # compute crc32 of 1-byte
+       shr     $8, %rax                # get next byte
+       dec     n_misaligned
         jne     .Lalign_loop
-
-.Lproc_block:
+.Laligned:
  
         ################################################################
-       ## 2) PROCESS  BLOCKS:
+       ## 2) PROCESS BLOCK:
         ################################################################
  
-       ## compute num of bytes to be processed
-
         cmp     $128*24, len
         jae     .Lfull_block
  
-.Lcontinue_block:
-       ## len < 128*24
-       movq    $2731, %rax             # 2731 = ceil(2^16 / 24)
-       mul     len
-       shrq    $16, %rax
-
-       ## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
-       ## process rax 24-byte chunks (128 >= rax >= 0)
-
-       ## compute end address of each block
-       ## block 0 (base addr + RAX * 8)
-       ## block 1 (base addr + RAX * 16)
-       ## block 2 (base addr + RAX * 24)
-       lea     (bufptmp, %rax, 8), block_0
-       lea     (block_0, %rax, 8), block_1
-       lea     (block_1, %rax, 8), block_2
-
-       xor     crc1, crc1
-       xor     crc2, crc2
-
-       ## branch into array
-       leaq    jump_table(%rip), %bufp
-       mov     (%bufp,%rax,8), %bufp
-       JMP_NOSPEC bufp
+.Lpartial_block:
+       # Compute floor(len / 24) to get num qwords to process from each lane.
+       imul    $2731, len, %eax        # 2731 = ceil(2^16 / 24)
+       shr     $16, %eax
+       jmp     .Lcrc_3lanes
  
-       ################################################################
-       ## 2a) PROCESS FULL BLOCKS:
-       ################################################################
  .Lfull_block:
-       movl    $128,%eax
-       lea     128*8*2(block_0), block_1
-       lea     128*8*3(block_0), block_2
-       add     $128*8*1, block_0
-
-       xor     crc1,crc1
-       xor     crc2,crc2
-
-       # Fall through into top of crc array (crc_128)
+       # Processing 128 qwords from each lane.
+       mov     $128, %eax
  
         ################################################################
-       ## 3) CRC Array:
+       ## 3) CRC each of three lanes:
         ################################################################
  
-       i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-       ENDBR
-       crc32q   -i*8(block_0), crc_init_q
-       crc32q   -i*8(block_1), crc1
-       crc32q   -i*8(block_2), crc2
-       i=(i-1)
-.endr
-
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-       ENDBR
-       crc32q   -i*8(block_0), crc_init_q
-       crc32q   -i*8(block_1), crc1
-# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
+.Lcrc_3lanes:
+       xor     crc1,crc1
+       xor     crc2,crc2
+       mov     %eax, chunk_bytes
+       shl     $3, chunk_bytes         # num bytes to process from each lane
+       sub     $5, %eax                # 4 for 4x_loop, 1 for special last iter
+       jl      .Lcrc_3lanes_4x_done
+
+       # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+       # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+       crc32q  (bufp), crc_init_q
+       crc32q  (bufp,chunk_bytes_q), crc1
+       crc32q  (bufp,chunk_bytes_q,2), crc2
+       crc32q  8(bufp), crc_init_q
+       crc32q  8(bufp,chunk_bytes_q), crc1
+       crc32q  8(bufp,chunk_bytes_q,2), crc2
+       crc32q  16(bufp), crc_init_q
+       crc32q  16(bufp,chunk_bytes_q), crc1
+       crc32q  16(bufp,chunk_bytes_q,2), crc2
+       crc32q  24(bufp), crc_init_q
+       crc32q  24(bufp,chunk_bytes_q), crc1
+       crc32q  24(bufp,chunk_bytes_q,2), crc2
+       add     $32, bufp
+       sub     $4, %eax
+       jge     .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+       add     $4, %eax
+       jz      .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+       crc32q  (bufp), crc_init_q
+       crc32q  (bufp,chunk_bytes_q), crc1
+       crc32q  (bufp,chunk_bytes_q,2), crc2
+       add     $8, bufp
+       dec     %eax
+       jnz     .Lcrc_3lanes_1x_loop
  
-       mov     block_2, block_0
+.Lcrc_3lanes_last_qword:
+       crc32q  (bufp), crc_init_q
+       crc32q  (bufp,chunk_bytes_q), crc1
+# SKIP  crc32q (bufp,chunk_bytes_q,2), crc2    ; Don't do this one yet
  
         ################################################################
         ## 4) Combine three results:
         ################################################################
  
-       lea     (K_table-8)(%rip), %bufp                # first entry is for idx 1
-       shlq    $3, %rax                        # rax *= 8
-       pmovzxdq (%bufp,%rax), %xmm0            # 2 consts: K1:K2
-       leal    (%eax,%eax,2), %eax             # rax *= 3 (total *24)
-       sub     %eax, len                       # len -= rax*24
+       lea     (K_table-8)(%rip), %rax         # first entry is for idx 1
+       pmovzxdq (%rax,chunk_bytes_q), %xmm0    # 2 consts: K1:K2
+       lea     (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+       sub     %eax, len                       # len -= chunk_bytes * 3
  
         movq    crc_init_q, %xmm1               # CRC for block 1
         pclmulqdq $0x00, %xmm0, %xmm1           # Multiply by K2
@@ -206,20 +175,19 @@ LABEL crc_ %i
  
         pxor    %xmm2,%xmm1
         movq    %xmm1, %rax
-       xor     -i*8(block_2), %rax
+       xor     (bufp,chunk_bytes_q,2), %rax
         mov     crc2, crc_init_q
         crc32   %rax, crc_init_q
+       lea     8(bufp,chunk_bytes_q,2), bufp
  
         ################################################################
-       ## 5) Check for end:
+       ## 5) If more blocks remain, goto (2):
         ################################################################
  
-LABEL crc_ 0
-       ENDBR
         cmp     $128*24, len
-       jae     .Lfull_block
+       jae     .Lfull_block
         cmp     $SMALL_SIZE, len
-       jae     .Lcontinue_block
+       jae     .Lpartial_block
  
         #######################################################################
         ## 6) Process any remainder without interleaving:
@@ -231,47 +199,30 @@ LABEL crc_ 0
         shr     $3, %eax
         jz      .Ldo_dword
  .Ldo_qwords:
-       crc32q  (bufptmp), crc_init_q
-       add     $8, bufptmp
+       crc32q  (bufp), crc_init_q
+       add     $8, bufp
         dec     %eax
         jnz     .Ldo_qwords
  .Ldo_dword:
         test    $4, len
         jz      .Ldo_word
-       crc32l  (bufptmp), crc_init
-       add     $4, bufptmp
+       crc32l  (bufp), crc_init
+       add     $4, bufp
  .Ldo_word:
         test    $2, len
         jz      .Ldo_byte
-       crc32w  (bufptmp), crc_init
-       add     $2, bufptmp
+       crc32w  (bufp), crc_init
+       add     $2, bufp
  .Ldo_byte:
         test    $1, len
         jz      .Ldone
-       crc32b  (bufptmp), crc_init
+       crc32b  (bufp), crc_init
  .Ldone:
         mov     crc_init, %eax
-       popq    %rsi
-       popq    %rdi
-       popq    %rbx
          RET
  SYM_FUNC_END(crc_pcl)
  
  .section       .rodata, "a", @progbits
-        ################################################################
-        ## jump table        Table is 129 entries x 2 bytes each
-        ################################################################
-.align 4
-jump_table:
-       i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
-       i=i+1
-.endr
-
-
         ################################################################
         ## PCLMULQDQ tables
         ## Table is 128 entries x 2 words (8 bytes) each
author	Eric Biggers <ebiggers@google.com>
	Mon, 14 Oct 2024 04:24:47 +0000 (21:24 -0700)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Sat, 26 Oct 2024 06:41:59 +0000 (14:41 +0800)