/* Threshold to use Enhanced REP STOSB. */
long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+/* Threshold to stop using Enhanced REP MOVSB. */
+long int __x86_rep_movsb_stop_threshold attribute_hidden;
+
static void
init_cacheinfo (void)
{
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
+ __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
}
#endif
int max_cpuid_ex;
long int data = -1;
long int shared = -1;
- long int core;
+ long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_dcache_size = -1;
#endif
}
+ unsigned long int rep_movsb_stop_threshold;
+ /* ERMS is implemented starting with the AMD Zen3 architecture, but it
+ performs poorly for sizes above the L2 cache size. Therefore add an
+ upper bound threshold that limits the use of Enhanced REP MOVSB and
+ set its value to the L2 cache size. */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_stop_threshold = core;
+ /* For architectures other than AMD, set the upper bound of ERMS to
+ the computed value of the non-temporal threshold. */
+ else
+ rep_movsb_stop_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
cpu_features->non_temporal_threshold = non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
+ cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}
unsigned long int non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
+ /* Threshold to stop using "rep movsb". */
+ unsigned long int rep_movsb_stop_threshold;
/* Threshold to use "rep stosb". */
unsigned long int rep_stosb_threshold;
/* _SC_LEVEL1_ICACHE_SIZE. */
load and aligned store. Load the last 4 * VEC and first VEC
before the loop and store them after the loop to support
overlapping addresses.
- 6. If size >= __x86_shared_non_temporal_threshold and there is no
+ 6. On machines with the ERMS feature, if size is greater than or
+ equal to __x86_rep_movsb_threshold and less than
+ __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
+ 7. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
instead of aligned store. */
ret
L(movsb):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
jae L(more_8x_vec)
cmpq %rsi, %rdi
jb 1f