From 439e58d0607786feeaa90f120fdb023fad8880a2 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 7 May 2021 23:58:13 -0700 Subject: [PATCH] improved gcc-9 and gcc-10 decoding speed the new alignment setting is better for gcc-9 and gcc-10 by about ~+5%. Unfortunately, it's worse for essentially all other compilers. Make the new alignment setting conditional to gcc-9+. --- lib/decompress/zstd_decompress_block.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index a8ba8d8bb..349dcdc33 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1142,13 +1142,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, * If you see most cycles served out of the DSB you've hit the good case. * If it is pretty even then you may be in an okay case. * - * I've been able to reproduce this issue on the following CPUs: + * This issue has been reproduced on the following CPUs: * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 * Use Instruments->Counters to get DSB/MITE cycles. * I never got performance swings, but I was able to * go from the good case of mostly DSB to half of the * cycles served from MITE. * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k * * I haven't been able to reproduce the instability or DSB misses on any * of the following CPUS: @@ -1165,7 +1166,12 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, __asm__("nop"); __asm__(".p2align 5"); __asm__("nop"); +# if __GNUC__ >= 9 + /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */ + __asm__(".p2align 3"); +# else __asm__(".p2align 4"); +# endif #endif for ( ; ; ) { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -- 2.47.2