-# Calling convention:
-#
-# %rdi contains the first argument: HUF_DecompressAsmArgs*.
-# %rbp is'nt maintained (no frame pointer).
-# %rsp contains the stack pointer that grows down.
-# No red-zone is assumed, only addresses >= %rsp are used.
-# All register contents are preserved.
-#
-# TODO: Support Windows calling convention.
-
#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__)
+
+/* Calling convention:
+ *
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
+ * %rbp isn't maintained (no frame pointer).
+ * %rsp contains the stack pointer that grows down.
+ * No red-zone is assumed, only addresses >= %rsp are used.
+ * All register contents are preserved.
+ *
+ * TODO: Support Windows calling convention.
+ */
+
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.text
-# Sets up register mappings for clarity.
-# op[], bits[], dtable & ip[0] each get their own register.
-# ip[1,2,3] & olimit alias var[].
-# %rax is a scratch register.
-
-#define op0 rsi
-#define op1 rbx
-#define op2 rcx
-#define op3 rdi
-
-#define ip0 r8
-#define ip1 r9
-#define ip2 r10
-#define ip3 r11
-
-#define bits0 rbp
-#define bits1 rdx
-#define bits2 r12
-#define bits3 r13
-#define dtable r14
-#define olimit r15
-
-# var[] aliases ip[1,2,3] & olimit
-# ip[1,2,3] are saved every iteration.
-# olimit is only used in compute_olimit.
-#define var0 r15
-#define var1 r9
-#define var2 r10
-#define var3 r11
-
-# 32-bit var registers
-#define vard0 r15d
-#define vard1 r9d
-#define vard2 r10d
-#define vard3 r11d
-
-# Helper macro: args if idx != 4.
+/* Sets up register mappings for clarity.
+ * op[], bits[], dtable & ip[0] each get their own register.
+ * ip[1,2,3] & olimit alias var[].
+ * %rax is a scratch register.
+ */
+
+#define op0 rsi
+#define op1 rbx
+#define op2 rcx
+#define op3 rdi
+
+#define ip0 r8
+#define ip1 r9
+#define ip2 r10
+#define ip3 r11
+
+#define bits0 rbp
+#define bits1 rdx
+#define bits2 r12
+#define bits3 r13
+#define dtable r14
+#define olimit r15
+
+/* var[] aliases ip[1,2,3] & olimit
+ * ip[1,2,3] are saved every iteration.
+ * olimit is only used in compute_olimit.
+ */
+#define var0 r15
+#define var1 r9
+#define var2 r10
+#define var3 r11
+
+/* 32-bit var registers */
+#define vard0 r15d
+#define vard1 r9d
+#define vard2 r10d
+#define vard3 r11d
+
+/* Helper macro: args if idx != 4. */
#define IF_NOT_4_0(...) __VA_ARGS__
#define IF_NOT_4_1(...) __VA_ARGS__
#define IF_NOT_4_2(...) __VA_ARGS__
#define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__)
#define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__)
-# Calls X(N) for each stream 0, 1, 2, 3.
+/* Calls X(N) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM(X) \
- X(0); \
- X(1); \
- X(2); \
- X(3)
+ X(0); \
+ X(1); \
+ X(2); \
+ X(3)
-# Calls X(N, idx) for each stream 0, 1, 2, 3.
+/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
- X(0, idx); \
- X(1, idx); \
- X(2, idx); \
- X(3, idx)
-
-# Define both _HUF_* & HUF_* symbols because MacOS
-# C symbols are prefixed with '_' & Linux symbols aren't.
+ X(0, idx); \
+ X(1, idx); \
+ X(2, idx); \
+ X(3, idx)
+
+/* Define both _HUF_* & HUF_* symbols because MacOS
+ * C symbols are prefixed with '_' & Linux symbols aren't.
+ */
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
- # Save all registers - even if they are callee saved for simplicity.
- push %rax
- push %rbx
- push %rcx
- push %rdx
- push %rbp
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Read HUF_DecompressAsmArgs* args from %rax
- movq %rdi, %rax
- movq 0(%rax), %ip0
- movq 8(%rax), %ip1
- movq 16(%rax), %ip2
- movq 24(%rax), %ip3
- movq 32(%rax), %op0
- movq 40(%rax), %op1
- movq 48(%rax), %op2
- movq 56(%rax), %op3
- movq 64(%rax), %bits0
- movq 72(%rax), %bits1
- movq 80(%rax), %bits2
- movq 88(%rax), %bits3
- movq 96(%rax), %dtable
- push %rax # argument
- push 104(%rax) # ilimit
- push 112(%rax) # oend
- push %olimit # olimit space
-
- subq $24, %rsp
+ /* Save all registers - even if they are callee saved for simplicity. */
+ push %rax
+ push %rbx
+ push %rcx
+ push %rdx
+ push %rbp
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ /* Read HUF_DecompressAsmArgs* args from %rax */
+ movq %rdi, %rax
+ movq 0(%rax), %ip0
+ movq 8(%rax), %ip1
+ movq 16(%rax), %ip2
+ movq 24(%rax), %ip3
+ movq 32(%rax), %op0
+ movq 40(%rax), %op1
+ movq 48(%rax), %op2
+ movq 56(%rax), %op3
+ movq 64(%rax), %bits0
+ movq 72(%rax), %bits1
+ movq 80(%rax), %bits2
+ movq 88(%rax), %bits3
+ movq 96(%rax), %dtable
+ push %rax /* argument */
+ push 104(%rax) /* ilimit */
+ push 112(%rax) /* oend */
+ push %olimit /* olimit space */
+
+ subq $24, %rsp
.L_4X1_compute_olimit:
- # Computes how many iterations we can do savely
- # %r15, %rax may be clobbered
- # rbx, rdx must be saved
- # op3 & ip0 mustn't be clobbered
- movq %rbx, 0(%rsp)
- movq %rdx, 8(%rsp)
-
- movq 32(%rsp), %rax # rax = oend
- subq %op3, %rax # rax = oend - op3
-
- # r15 = (oend - op3) / 5
- movabsq $-3689348814741910323, %rdx
- mulq %rdx
- movq %rdx, %r15
- shrq $2, %r15
-
- movq %ip0, %rax # rax = ip0
- movq 40(%rsp), %rdx # rdx = ilimit
- subq %rdx, %rax # rax = ip0 - ilimit
- movq %rax, %rbx # rbx = ip0 - ilimit
-
- # rdx = (ip0 - ilimit) / 7
- movabsq $2635249153387078803, %rdx
- mulq %rdx
- subq %rdx, %rbx
- shrq %rbx
- addq %rbx, %rdx
- shrq $2, %rdx
-
- # r15 = min(%rdx, %r15)
- cmpq %rdx, %r15
- cmova %rdx, %r15
-
- # r15 = r15 * 5
- leaq (%r15, %r15, 4), %r15
-
- # olimit = op3 + r15
- addq %op3, %olimit
-
- movq 8(%rsp), %rdx
- movq 0(%rsp), %rbx
-
- # If (op3 + 20 > olimit)
- movq %op3, %rax # rax = op3
- addq $20, %rax # rax = op3 + 20
- cmpq %rax, %olimit # op3 + 20 > olimit
- jb .L_4X1_exit
-
- # If (ip1 < ip0) go to exit
- cmpq %ip0, %ip1
- jb .L_4X1_exit
-
- # If (ip2 < ip1) go to exit
- cmpq %ip1, %ip2
- jb .L_4X1_exit
-
- # If (ip3 < ip2) go to exit
- cmpq %ip2, %ip3
- jb .L_4X1_exit
-
-# Reads top 11 bits from bits[n]
-# Loads dt[bits[n]] into var[n]
-#define GET_NEXT_DELT(n) \
- movq $53, %var##n; \
- shrxq %var##n, %bits##n, %var##n; \
- movzwl (%dtable,%var##n,2),%vard##n
-
-# var[n] must contain the DTable entry computed with GET_NEXT_DELT
-# Moves var[n] to %rax
-# bits[n] <<= var[n] & 63
-# op[n][idx] = %rax >> 8
-# %ah is a way to access bits [8, 16) of %rax
-#define DECODE_FROM_DELT(n, idx) \
- movq %var##n, %rax; \
- shlxq %var##n, %bits##n, %bits##n; \
- movb %ah, idx(%op##n)
-
-# Assumes GET_NEXT_DELT has been called.
-# Calls DECODE_FROM_DELT then GET_NEXT_DELT if n < 4
-#define DECODE(n, idx) \
- DECODE_FROM_DELT(n, idx); \
- IF_NOT_4(idx, GET_NEXT_DELT(n))
-
-# // ctz & nbBytes is stored in bits[n]
-# // nbBits is stored in %rax
-# ctz = CTZ[bits[n]]
-# nbBits = ctz & 7
-# nbBytes = ctz >> 3
-# op[n] += 5
-# ip[n] -= nbBytes
-# // Note: x86-64 is little-endian ==> no bswap
-# bits[n] = MEM_readST(ip[n]) | 1
-# bits[n] <<= nbBits
-#define RELOAD_BITS(n) \
- bsfq %bits##n, %bits##n; \
- movq %bits##n, %rax; \
- andq $7, %rax; \
- shrq $3, %bits##n; \
- leaq 5(%op##n), %op##n; \
- subq %bits##n, %ip##n; \
- movq (%ip##n), %bits##n; \
- orq $1, %bits##n; \
- shlx %rax, %bits##n, %bits##n;
-
- # Store clobbered variables on the stack
- movq %olimit, 24(%rsp)
- movq %ip1, 0(%rsp)
- movq %ip2, 8(%rsp)
- movq %ip3, 16(%rsp)
-
- # Call GET_NEXT_DELT for each stream
- FOR_EACH_STREAM(GET_NEXT_DELT)
-
- .p2align 6
+ /* Computes how many iterations we can do safely
+ * %r15, %rax may be clobbered
+ * rbx, rdx must be saved
+ * op3 & ip0 mustn't be clobbered
+ */
+ movq %rbx, 0(%rsp)
+ movq %rdx, 8(%rsp)
+
+ movq 32(%rsp), %rax /* rax = oend */
+ subq %op3, %rax /* rax = oend - op3 */
+
+ /* r15 = (oend - op3) / 5 */
+ movabsq $-3689348814741910323, %rdx
+ mulq %rdx
+ movq %rdx, %r15
+ shrq $2, %r15
+
+ movq %ip0, %rax /* rax = ip0 */
+ movq 40(%rsp), %rdx /* rdx = ilimit */
+ subq %rdx, %rax /* rax = ip0 - ilimit */
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
+
+ /* rdx = (ip0 - ilimit) / 7 */
+ movabsq $2635249153387078803, %rdx
+ mulq %rdx
+ subq %rdx, %rbx
+ shrq %rbx
+ addq %rbx, %rdx
+ shrq $2, %rdx
+
+ /* r15 = min(%rdx, %r15) */
+ cmpq %rdx, %r15
+ cmova %rdx, %r15
+
+ /* r15 = r15 * 5 */
+ leaq (%r15, %r15, 4), %r15
+
+ /* olimit = op3 + r15 */
+ addq %op3, %olimit
+
+ movq 8(%rsp), %rdx
+ movq 0(%rsp), %rbx
+
+ /* If (op3 + 20 > olimit) go to exit */
+ movq %op3, %rax /* rax = op3 */
+ addq $20, %rax /* rax = op3 + 20 */
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
+ jb .L_4X1_exit
+
+ /* If (ip1 < ip0) go to exit */
+ cmpq %ip0, %ip1
+ jb .L_4X1_exit
+
+ /* If (ip2 < ip1) go to exit */
+ cmpq %ip1, %ip2
+ jb .L_4X1_exit
+
+ /* If (ip3 < ip2) go to exit */
+ cmpq %ip2, %ip3
+ jb .L_4X1_exit
+
+/* Reads top 11 bits from bits[n]
+ * Loads dt[bits[n]] into var[n]
+ */
+#define GET_NEXT_DELT(n) \
+ movq $53, %var##n; \
+ shrxq %var##n, %bits##n, %var##n; \
+ movzwl (%dtable,%var##n,2),%vard##n
+
+/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
+ * Moves var[n] to %rax
+ * bits[n] <<= var[n] & 63
+ * op[n][idx] = %rax >> 8
+ * %ah is a way to access bits [8, 16) of %rax
+ */
+#define DECODE_FROM_DELT(n, idx) \
+ movq %var##n, %rax; \
+ shlxq %var##n, %bits##n, %bits##n; \
+ movb %ah, idx(%op##n)
+
+/* Assumes GET_NEXT_DELT has been called.
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT if idx != 4
+ */
+#define DECODE(n, idx) \
+ DECODE_FROM_DELT(n, idx); \
+ IF_NOT_4(idx, GET_NEXT_DELT(n))
+
+/* // ctz & nbBytes is stored in bits[n]
+ * // nbBits is stored in %rax
+ * ctz = CTZ[bits[n]]
+ * nbBits = ctz & 7
+ * nbBytes = ctz >> 3
+ * op[n] += 5
+ * ip[n] -= nbBytes
+ * // Note: x86-64 is little-endian ==> no bswap
+ * bits[n] = MEM_readST(ip[n]) | 1
+ * bits[n] <<= nbBits
+ */
+#define RELOAD_BITS(n) \
+ bsfq %bits##n, %bits##n; \
+ movq %bits##n, %rax; \
+ andq $7, %rax; \
+ shrq $3, %bits##n; \
+ leaq 5(%op##n), %op##n; \
+ subq %bits##n, %ip##n; \
+ movq (%ip##n), %bits##n; \
+ orq $1, %bits##n; \
+ shlx %rax, %bits##n, %bits##n
+
+ /* Store clobbered variables on the stack */
+ movq %olimit, 24(%rsp)
+ movq %ip1, 0(%rsp)
+ movq %ip2, 8(%rsp)
+ movq %ip3, 16(%rsp)
+
+ /* Call GET_NEXT_DELT for each stream */
+ FOR_EACH_STREAM(GET_NEXT_DELT)
+
+ .p2align 6
.L_4X1_loop_body:
-# LLVM-MCA-BEGIN decode-4X1
- # Decode 5 symbols in each of the 4 streams (20 total)
- # Must have called GET_NEXT_DELT for each stream
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
-
- # Load ip[1,2,3] from stack (var[] aliases them)
- # ip[] is needed for RELOAD_BITS
- # Each will be stored back to the stack after RELOAD
- movq 0(%rsp), %ip1
- movq 8(%rsp), %ip2
- movq 16(%rsp), %ip3
-
- # Reload each stream & fetch the next table entry
- # to prepare for the next iteration
- RELOAD_BITS(0)
- GET_NEXT_DELT(0)
-
- RELOAD_BITS(1)
- movq %ip1, 0(%rsp)
- GET_NEXT_DELT(1)
-
- RELOAD_BITS(2)
- movq %ip2, 8(%rsp)
- GET_NEXT_DELT(2)
-
- RELOAD_BITS(3)
- movq %ip3, 16(%rsp)
- GET_NEXT_DELT(3)
-
- # If op3 < olimit: continue the loop
- cmp %op3, 24(%rsp)
- ja .L_4X1_loop_body
-
- # Reload ip[1,2,3] from stack
- movq 0(%rsp), %ip1
- movq 8(%rsp), %ip2
- movq 16(%rsp), %ip3
-
- # Re-compute olimit
- jmp .L_4X1_compute_olimit
+ /* Decode 5 symbols in each of the 4 streams (20 total)
+ * Must have called GET_NEXT_DELT for each stream
+ */
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
+
+ /* Load ip[1,2,3] from stack (var[] aliases them)
+ * ip[] is needed for RELOAD_BITS
+ * Each will be stored back to the stack after RELOAD
+ */
+ movq 0(%rsp), %ip1
+ movq 8(%rsp), %ip2
+ movq 16(%rsp), %ip3
+
+ /* Reload each stream & fetch the next table entry
+ * to prepare for the next iteration
+ */
+ RELOAD_BITS(0)
+ GET_NEXT_DELT(0)
+
+ RELOAD_BITS(1)
+ movq %ip1, 0(%rsp)
+ GET_NEXT_DELT(1)
+
+ RELOAD_BITS(2)
+ movq %ip2, 8(%rsp)
+ GET_NEXT_DELT(2)
+
+ RELOAD_BITS(3)
+ movq %ip3, 16(%rsp)
+ GET_NEXT_DELT(3)
+
+ /* If op3 < olimit: continue the loop */
+ cmp %op3, 24(%rsp)
+ ja .L_4X1_loop_body
+
+ /* Reload ip[1,2,3] from stack */
+ movq 0(%rsp), %ip1
+ movq 8(%rsp), %ip2
+ movq 16(%rsp), %ip3
+
+ /* Re-compute olimit */
+ jmp .L_4X1_compute_olimit
#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE
#undef RELOAD_BITS
-# LLVM-MCA-END
.L_4X1_exit:
- addq $24, %rsp
-
- # Restore stack (oend & olimit)
- pop %rax # olimit
- pop %rax # oend
- pop %rax # ilimit
- pop %rax # arg
-
- # Save ip / op / bits
- movq %ip0, 0(%rax)
- movq %ip1, 8(%rax)
- movq %ip2, 16(%rax)
- movq %ip3, 24(%rax)
- movq %op0, 32(%rax)
- movq %op1, 40(%rax)
- movq %op2, 48(%rax)
- movq %op3, 56(%rax)
- movq %bits0, 64(%rax)
- movq %bits1, 72(%rax)
- movq %bits2, 80(%rax)
- movq %bits3, 88(%rax)
-
- # Restore registers
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rbp
- pop %rdx
- pop %rcx
- pop %rbx
- pop %rax
- ret
+ addq $24, %rsp
+
+ /* Restore stack: discard olimit, oend & ilimit, then pop arg into %rax */
+ pop %rax /* olimit */
+ pop %rax /* oend */
+ pop %rax /* ilimit */
+ pop %rax /* arg */
+
+ /* Save ip / op / bits */
+ movq %ip0, 0(%rax)
+ movq %ip1, 8(%rax)
+ movq %ip2, 16(%rax)
+ movq %ip3, 24(%rax)
+ movq %op0, 32(%rax)
+ movq %op1, 40(%rax)
+ movq %op2, 48(%rax)
+ movq %op3, 56(%rax)
+ movq %bits0, 64(%rax)
+ movq %bits1, 72(%rax)
+ movq %bits2, 80(%rax)
+ movq %bits3, 88(%rax)
+
+ /* Restore registers */
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rbp
+ pop %rdx
+ pop %rcx
+ pop %rbx
+ pop %rax
+ ret
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
- # Save all registers - even if they are callee saved for simplicity.
- push %rax
- push %rbx
- push %rcx
- push %rdx
- push %rbp
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- push %r12
- push %r13
- push %r14
- push %r15
-
- movq %rdi, %rax
- movq 0(%rax), %ip0
- movq 8(%rax), %ip1
- movq 16(%rax), %ip2
- movq 24(%rax), %ip3
- movq 32(%rax), %op0
- movq 40(%rax), %op1
- movq 48(%rax), %op2
- movq 56(%rax), %op3
- movq 64(%rax), %bits0
- movq 72(%rax), %bits1
- movq 80(%rax), %bits2
- movq 88(%rax), %bits3
- movq 96(%rax), %dtable
- push %rax # argument
- push %rax # olimit
- push 104(%rax) # ilimit
-
- movq 112(%rax), %rax
- push %rax # oend3
-
- movq %op3, %rax
- push %rax # oend2
-
- movq %op2, %rax
- push %rax # oend1
-
- movq %op1, %rax
- push %rax # oend0
-
- # Scratch space
- subq $8, %rsp
+ /* Save all registers - even if they are callee saved for simplicity. */
+ push %rax
+ push %rbx
+ push %rcx
+ push %rdx
+ push %rbp
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ movq %rdi, %rax
+ movq 0(%rax), %ip0
+ movq 8(%rax), %ip1
+ movq 16(%rax), %ip2
+ movq 24(%rax), %ip3
+ movq 32(%rax), %op0
+ movq 40(%rax), %op1
+ movq 48(%rax), %op2
+ movq 56(%rax), %op3
+ movq 64(%rax), %bits0
+ movq 72(%rax), %bits1
+ movq 80(%rax), %bits2
+ movq 88(%rax), %bits3
+ movq 96(%rax), %dtable
+ push %rax /* argument */
+ push %rax /* olimit */
+ push 104(%rax) /* ilimit */
+
+ movq 112(%rax), %rax
+ push %rax /* oend3 */
+
+ movq %op3, %rax
+ push %rax /* oend2 */
+
+ movq %op2, %rax
+ push %rax /* oend1 */
+
+ movq %op1, %rax
+ push %rax /* oend0 */
+
+ /* Scratch space */
+ subq $8, %rsp
.L_4X2_compute_olimit:
- # Computes how many iterations we can do savely
- # %r15, %rax may be clobbered
- # rdx must be saved
- # op[1,2,3,4] & ip0 mustn't be clobbered
- movq %rdx, 0(%rsp)
-
- # We can consume up to 7 input bytes each iteration.
- movq %ip0, %rax # rax = ip0
- movq 40(%rsp), %rdx # rdx = ilimit
- subq %rdx, %rax # rax = ip0 - ilimit
- movq %rax, %r15 # r15 = ip0 - ilimit
-
- # rdx = rax / 7
- movabsq $2635249153387078803, %rdx
- mulq %rdx
- subq %rdx, %r15
- shrq %r15
- addq %r15, %rdx
- shrq $2, %rdx
-
- # r15 = (ip0 - ilimit) / 7
- movq %rdx, %r15
-
- movabsq $-3689348814741910323, %rdx
- movq 8(%rsp), %rax # rax = oend0
- subq %op0, %rax # rax = oend0 - op0
- mulq %rdx
- shrq $3, %rdx # rdx = rax / 10
-
- # r15 = min(%rdx, %r15)
- cmpq %rdx, %r15
- cmova %rdx, %r15
-
- movabsq $-3689348814741910323, %rdx
- movq 16(%rsp), %rax # rax = oend1
- subq %op1, %rax # rax = oend1 - op1
- mulq %rdx
- shrq $3, %rdx # rdx = rax / 10
-
- # r15 = min(%rdx, %r15)
- cmpq %rdx, %r15
- cmova %rdx, %r15
-
- movabsq $-3689348814741910323, %rdx
- movq 24(%rsp), %rax # rax = oend2
- subq %op2, %rax # rax = oend2 - op2
- mulq %rdx
- shrq $3, %rdx # rdx = rax / 10
-
- # r15 = min(%rdx, %r15)
- cmpq %rdx, %r15
- cmova %rdx, %r15
-
- movabsq $-3689348814741910323, %rdx
- movq 32(%rsp), %rax # rax = oend3
- subq %op3, %rax # rax = oend3 - op3
- mulq %rdx
- shrq $3, %rdx # rdx = rax / 10
-
- # r15 = min(%rdx, %r15)
- cmpq %rdx, %r15
- cmova %rdx, %r15
-
- # olimit = op3 + 5 * r15
- movq %r15, %rax
- leaq (%op3, %rax, 4), %olimit
- addq %rax, %olimit
-
- movq 0(%rsp), %rdx
-
- # If (op3 + 10 > olimit)
- movq %op3, %rax # rax = op3
- addq $10, %rax # rax = op3 + 10
- cmpq %rax, %olimit # op3 + 10 > olimit
- jb .L_4X2_exit
-
- # If (ip1 < ip0) go to exit
- cmpq %ip0, %ip1
- jb .L_4X2_exit
-
- # If (ip2 < ip1) go to exit
- cmpq %ip1, %ip2
- jb .L_4X2_exit
-
- # If (ip3 < ip2) go to exit
- cmpq %ip2, %ip3
- jb .L_4X2_exit
-
-#define DECODE(n, idx) \
- movq %bits##n, %rax; \
- shrq $53, %rax; \
- movzwl 0(%dtable,%rax,4),%r8d; \
- movzbl 2(%dtable,%rax,4),%r15d; \
- movzbl 3(%dtable,%rax,4),%eax; \
- movw %r8w, (%op##n); \
- shlxq %r15, %bits##n, %bits##n; \
- addq %rax, %op##n
-
-#define RELOAD_BITS(n) \
- bsfq %bits##n, %bits##n; \
- movq %bits##n, %rax; \
- shrq $3, %bits##n; \
- andq $7, %rax; \
- subq %bits##n, %ip##n; \
- movq (%ip##n), %bits##n; \
- orq $1, %bits##n; \
- shlxq %rax, %bits##n, %bits##n;
-
-
- movq %olimit, 48(%rsp)
-
- .p2align 6
+ /* Computes how many iterations we can do safely
+ * %r15, %rax may be clobbered
+ * rdx must be saved
+ * op[0,1,2,3] & ip0 mustn't be clobbered
+ */
+ movq %rdx, 0(%rsp)
+
+ /* We can consume up to 7 input bytes each iteration. */
+ movq %ip0, %rax /* rax = ip0 */
+ movq 40(%rsp), %rdx /* rdx = ilimit */
+ subq %rdx, %rax /* rax = ip0 - ilimit */
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
+
+ /* rdx = rax / 7 */
+ movabsq $2635249153387078803, %rdx
+ mulq %rdx
+ subq %rdx, %r15
+ shrq %r15
+ addq %r15, %rdx
+ shrq $2, %rdx
+
+ /* r15 = (ip0 - ilimit) / 7 */
+ movq %rdx, %r15
+
+ movabsq $-3689348814741910323, %rdx
+ movq 8(%rsp), %rax /* rax = oend0 */
+ subq %op0, %rax /* rax = oend0 - op0 */
+ mulq %rdx
+ shrq $3, %rdx /* rdx = rax / 10 */
+
+ /* r15 = min(%rdx, %r15) */
+ cmpq %rdx, %r15
+ cmova %rdx, %r15
+
+ movabsq $-3689348814741910323, %rdx
+ movq 16(%rsp), %rax /* rax = oend1 */
+ subq %op1, %rax /* rax = oend1 - op1 */
+ mulq %rdx
+ shrq $3, %rdx /* rdx = rax / 10 */
+
+ /* r15 = min(%rdx, %r15) */
+ cmpq %rdx, %r15
+ cmova %rdx, %r15
+
+ movabsq $-3689348814741910323, %rdx
+ movq 24(%rsp), %rax /* rax = oend2 */
+ subq %op2, %rax /* rax = oend2 - op2 */
+ mulq %rdx
+ shrq $3, %rdx /* rdx = rax / 10 */
+
+ /* r15 = min(%rdx, %r15) */
+ cmpq %rdx, %r15
+ cmova %rdx, %r15
+
+ movabsq $-3689348814741910323, %rdx
+ movq 32(%rsp), %rax /* rax = oend3 */
+ subq %op3, %rax /* rax = oend3 - op3 */
+ mulq %rdx
+ shrq $3, %rdx /* rdx = rax / 10 */
+
+ /* r15 = min(%rdx, %r15) */
+ cmpq %rdx, %r15
+ cmova %rdx, %r15
+
+ /* olimit = op3 + 5 * r15 */
+ movq %r15, %rax
+ leaq (%op3, %rax, 4), %olimit
+ addq %rax, %olimit
+
+ movq 0(%rsp), %rdx
+
+ /* If (op3 + 10 > olimit) go to exit */
+ movq %op3, %rax /* rax = op3 */
+ addq $10, %rax /* rax = op3 + 10 */
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
+ jb .L_4X2_exit
+
+ /* If (ip1 < ip0) go to exit */
+ cmpq %ip0, %ip1
+ jb .L_4X2_exit
+
+ /* If (ip2 < ip1) go to exit */
+ cmpq %ip1, %ip2
+ jb .L_4X2_exit
+
+ /* If (ip3 < ip2) go to exit */
+ cmpq %ip2, %ip3
+ jb .L_4X2_exit
+
+#define DECODE(n, idx) \
+ movq %bits##n, %rax; \
+ shrq $53, %rax; \
+ movzwl 0(%dtable,%rax,4),%r8d; \
+ movzbl 2(%dtable,%rax,4),%r15d; \
+ movzbl 3(%dtable,%rax,4),%eax; \
+ movw %r8w, (%op##n); \
+ shlxq %r15, %bits##n, %bits##n; \
+ addq %rax, %op##n
+
+#define RELOAD_BITS(n) \
+ bsfq %bits##n, %bits##n; \
+ movq %bits##n, %rax; \
+ shrq $3, %bits##n; \
+ andq $7, %rax; \
+ subq %bits##n, %ip##n; \
+ movq (%ip##n), %bits##n; \
+ orq $1, %bits##n; \
+ shlxq %rax, %bits##n, %bits##n
+
+
+ movq %olimit, 48(%rsp)
+
+ .p2align 6
.L_4X2_loop_body:
-# LLVM-MCA-BEGIN decode-4X2
-
- # We clobber r8, so store it on the stack
- movq %r8, 0(%rsp)
+ /* We clobber r8, so store it on the stack */
+ movq %r8, 0(%rsp)
- # Decode 5 symbols from each of the 4 streams (20 symbols total).
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
- # Reload r8
- movq 0(%rsp), %r8
+ /* Reload r8 */
+ movq 0(%rsp), %r8
- FOR_EACH_STREAM(RELOAD_BITS)
+ FOR_EACH_STREAM(RELOAD_BITS)
- cmp %op3, 48(%rsp)
- ja .L_4X2_loop_body
- jmp .L_4X2_compute_olimit
+ cmp %op3, 48(%rsp)
+ ja .L_4X2_loop_body
+ jmp .L_4X2_compute_olimit
#undef DECODE
#undef RELOAD_BITS
-# LLVM-MCA-END
.L_4X2_exit:
- addq $8, %rsp
- # Restore stack (oend & olimit)
- pop %rax # oend0
- pop %rax # oend1
- pop %rax # oend2
- pop %rax # oend3
- pop %rax # ilimit
- pop %rax # olimit
- pop %rax # arg
-
- # Save ip / op / bits
- movq %ip0, 0(%rax)
- movq %ip1, 8(%rax)
- movq %ip2, 16(%rax)
- movq %ip3, 24(%rax)
- movq %op0, 32(%rax)
- movq %op1, 40(%rax)
- movq %op2, 48(%rax)
- movq %op3, 56(%rax)
- movq %bits0, 64(%rax)
- movq %bits1, 72(%rax)
- movq %bits2, 80(%rax)
- movq %bits3, 88(%rax)
-
- # Restore registers
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rbp
- pop %rdx
- pop %rcx
- pop %rbx
- pop %rax
- ret
+ addq $8, %rsp
+ /* Restore stack: discard oend[0-3], ilimit & olimit, then pop arg into %rax */
+ pop %rax /* oend0 */
+ pop %rax /* oend1 */
+ pop %rax /* oend2 */
+ pop %rax /* oend3 */
+ pop %rax /* ilimit */
+ pop %rax /* olimit */
+ pop %rax /* arg */
+
+ /* Save ip / op / bits */
+ movq %ip0, 0(%rax)
+ movq %ip1, 8(%rax)
+ movq %ip2, 16(%rax)
+ movq %ip3, 24(%rax)
+ movq %op0, 32(%rax)
+ movq %op1, 40(%rax)
+ movq %op2, 48(%rax)
+ movq %op3, 56(%rax)
+ movq %bits0, 64(%rax)
+ movq %bits1, 72(%rax)
+ movq %bits2, 80(%rax)
+ movq %bits3, 88(%rax)
+
+ /* Restore registers */
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rbp
+ pop %rdx
+ pop %rcx
+ pop %rbx
+ pop %rax
+ ret
+
#endif