git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
sched/numa: Complete scanning of partial VMAs regardless of PID activity
author: Mel Gorman <mgorman@techsingularity.net>
Tue, 10 Oct 2023 08:31:42 +0000 (09:31 +0100)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 4 Oct 2024 14:29:22 +0000 (16:29 +0200)
[ Upstream commit b7a5b537c55c088d891ae554103d1b281abef781 ]

NUMA Balancing skips VMAs when the current task has not trapped a NUMA
fault within the VMA. If the VMA is skipped then mm->numa_scan_offset
advances and a task that is trapping faults within the VMA may never
fully update PTEs within the VMA.

Force tasks to update PTEs for partially scanned VMAs. The VMA will
be tagged for NUMA hints by some task but this removes some of the
benefit of tracking PID activity within a VMA. A follow-on patch
will mitigate this problem.

The test cases and machines evaluated did not trigger the corner case so
the performance results are neutral with only small changes within the
noise from normal test-to-test variance. However, the next patch makes
the corner case easier to trigger.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net
Stable-dep-of: f22cde4371f3 ("sched/numa: Fix the vma scan starving issue")
Signed-off-by: Sasha Levin <sashal@kernel.org>
include/linux/sched/numa_balancing.h
include/trace/events/sched.h
kernel/sched/fair.c

index c127a1509e2faad8ca856e9866791500dacc2e4b..7dcc0bdfddbbf1dfea8bf2c1f35e7745c107230f 100644 (file)
@@ -21,6 +21,7 @@ enum numa_vmaskip_reason {
        NUMAB_SKIP_INACCESSIBLE,
        NUMAB_SKIP_SCAN_DELAY,
        NUMAB_SKIP_PID_INACTIVE,
+       NUMAB_SKIP_IGNORE_PID,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
index b0d0dbf491ea61a8cd35c78c97cd97f4da030790..27b51c81b106735ac0472d3c0e2eb32427dc21eb 100644 (file)
@@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
        EM( NUMAB_SKIP_SHARED_RO,               "shared_ro" )   \
        EM( NUMAB_SKIP_INACCESSIBLE,            "inaccessible" )        \
        EM( NUMAB_SKIP_SCAN_DELAY,              "scan_delay" )  \
-       EMe(NUMAB_SKIP_PID_INACTIVE,            "pid_inactive" )
+       EM( NUMAB_SKIP_PID_INACTIVE,            "pid_inactive" )        \
+       EMe(NUMAB_SKIP_IGNORE_PID,              "ignore_pid_inactive" )
 
 /* Redefine for export. */
 #undef EM
index 07363b73ccdcc5a1600094dc71992ed331e31fe0..03eb1cab320d8fe8b64169bc21d6ba68ef0abb90 100644 (file)
@@ -3188,7 +3188,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
        p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
        unsigned long pids;
        /*
@@ -3201,7 +3201,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
                return true;
 
        pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-       return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+       if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+               return true;
+
+       /*
+        * Complete a scan that has already started regardless of PID access, or
+        * some VMAs may never be scanned in multi-threaded applications:
+        */
+       if (mm->numa_scan_offset > vma->vm_start) {
+               trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+               return true;
+       }
+
+       return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3345,7 +3357,7 @@ static void task_numa_work(struct callback_head *work)
                }
 
                /* Do not scan the VMA if task has not accessed */
-               if (!vma_is_accessed(vma)) {
+               if (!vma_is_accessed(mm, vma)) {
                        trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
                        continue;
                }