* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
* yes. If current has access to memory reserves as an oom victim, yes.
- * Otherwise, no.
+ * If the current task is PF_EXITING, yes. Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current task's cpuset
- * unless the task has been OOM killed.
+ * unless the task has been OOM killed or is exiting.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
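+ *
+ * For reference, a simplified sketch of the flag composition (see
+ * include/linux/gfp_types.h, or gfp.h on older trees, for the
+ * authoritative definitions):
+ *
+ *	#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+ *	#define GFP_USER	(GFP_KERNEL | __GFP_HARDWALL)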
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
* so no allocation on a node outside the cpuset is allowed (unless
- * in interrupt, of course).
+ * in interrupt, of course). The PF_EXITING check must therefore
+ * come before the __GFP_HARDWALL check; otherwise a dying task's
+ * hardwalled (GFP_USER) allocations would be denied on that fast
+ * path before the exiting check is ever reached.
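+ *
+ * As a concrete illustration (hypothetical scenario): a task tearing
+ * down its address space in do_exit() may still issue GFP_USER
+ * allocations; with the old ordering, any such allocation on a node
+ * outside its mems_allowed would hit the hardwall "return false"
+ * rather than the PF_EXITING fast path.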
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
* in alloc_flags. That logic and the checks below have the combined
* effect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* tsk_is_oom_victim - any node ok
+ * PF_EXITING - any node ok (let dying task exit quickly)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current task's mems_allowed ok.
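+ *
+ * A rough sketch of the first-pass caller in get_page_from_freelist()
+ * (mm/page_alloc.c; exact form varies across kernel versions):
+ *
+ *	if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ *	    !__cpuset_zone_allowed(zone, gfp_mask))
+ *		continue;	/* node not allowed, try next zone */
+ *
+ * where __cpuset_zone_allowed() resolves the zone to its node and
+ * lands here.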
*/
if (unlikely(tsk_is_oom_victim(current)))
return true;
- if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return false;
-
if (current->flags & PF_EXITING) /* Let dying task have memory */
return true;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return false;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);