--- /dev/null
+From 5ef64cc8987a9211d3f3667331ba3411a94ddc79 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 13 Sep 2020 14:05:35 -0700
+Subject: mm: allow a controlled amount of unfairness in the page lock
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 5ef64cc8987a9211d3f3667331ba3411a94ddc79 upstream.
+
+Commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") made
+the page locking entirely fair, in that if a waiter came in while the
+lock was held, the lock would be transferred to the lockers strictly in
+order.
+
+That was intended to finally get rid of the long-reported watchdog
+failures that involved the page lock under extreme load, where a process
+could end up waiting essentially forever, as other page lockers stole
+the lock from under it.
+
+It also improved some benchmarks, but it ended up causing huge
+performance regressions on others, simply because fair lock behavior
+doesn't end up giving out the lock as aggressively, causing better
+worst-case latency, but potentially much worse average latencies and
+throughput.
+
+Instead of reverting that change entirely, this introduces a controlled
+amount of unfairness, with a sysctl knob to tune it if somebody needs
+to. But the default value should hopefully be good for any normal load,
+allowing a few rounds of lock stealing, but enforcing the strict
+ordering once the lock has been stolen too many times.
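+
+As a rough sketch of the idea (simplified, and not the code this patch
+actually adds - wait_for_wakeup() and wait_for_fair_handoff() below are
+made-up placeholders, while trylock_page() and sysctl_page_lock_unfairness
+are real), an exclusive waiter now behaves roughly like this:
+
+	int tries = sysctl_page_lock_unfairness;	/* default: 5 rounds */
+
+	for (;;) {
+		/* Optimistic path: try to take (steal) the lock ourselves */
+		if (trylock_page(page))
+			break;
+
+		if (--tries < 0) {
+			/*
+			 * We have lost too many races against lock
+			 * stealers: queue as WQ_FLAG_EXCLUSIVE |
+			 * WQ_FLAG_CUSTOM and have the unlocker hand the
+			 * lock over to us directly.
+			 */
+			wait_for_fair_handoff(page);	/* placeholder */
+			break;
+		}
+
+		/* Sleep until the current holder unlocks, then race again */
+		wait_for_wakeup(page);			/* placeholder */
+	}
+
+The real logic lives in wait_on_page_bit_common() in the diff below.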
+
+There is also a hint from Matthieu Baerts that the fair page locking
+may end up exposing an ABBA deadlock that is hidden by the usual
+optimistic lock stealing, and while the unfairness doesn't fix the
+fundamental issue (and I'm still looking at that), it avoids it in
+practice.
+
+The amount of unfairness can be modified by writing a new value to the
+'sysctl_page_lock_unfairness' variable (default value of 5, exposed
+through /proc/sys/vm/page_lock_unfairness), but that is hopefully
+something we'd use mainly for debugging rather than being necessary for
+any deep system tuning.
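+
+Purely as an illustration (not part of this patch): since the knob is a
+plain proc file, it can be inspected and adjusted from userspace with
+ordinary file I/O.  The value 10 written below is just an arbitrary
+example:
+
+	/* Illustration only: query and update the page lock unfairness knob */
+	#include <stdio.h>
+
+	int main(void)
+	{
+		const char *path = "/proc/sys/vm/page_lock_unfairness";
+		FILE *f = fopen(path, "r");
+		int val;
+
+		if (f && fscanf(f, "%d", &val) == 1)
+			printf("page_lock_unfairness is currently %d\n", val);
+		if (f)
+			fclose(f);
+
+		f = fopen(path, "w");		/* writing needs privileges */
+		if (f) {
+			fprintf(f, "10\n");	/* allow up to 10 stealing rounds */
+			fclose(f);
+		}
+		return 0;
+	}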
+
+This whole issue has exposed just how critical the page lock can be, and
+how contended it gets under certain loads. And the main contention
+doesn't really seem to be anything related to IO (which was the origin
+of this lock), but about things like just verifying that the page's file
+mapping is stable while faulting the page into a page table.
+
+Link: https://lore.kernel.org/linux-fsdevel/ed8442fd-6f54-dd84-cd4a-941e8b7ee603@MichaelLarabel.com/
+Link: https://www.phoronix.com/scan.php?page=article&item=linux-50-59&num=1
+Link: https://lore.kernel.org/linux-fsdevel/c560a38d-8313-51fb-b1ec-e904bd8836bc@tessares.net/
+Reported-and-tested-by: Michael Larabel <Michael@michaellarabel.com>
+Tested-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Chris Mason <clm@fb.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Saeed Mirzamohammadi <saeed.mirzamohammadi@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 2
+ include/linux/wait.h | 2
+ kernel/sysctl.c | 8 ++
+ mm/filemap.c | 160 +++++++++++++++++++++++++++++++++++++++++----------
+ 4 files changed, 141 insertions(+), 31 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -37,6 +37,8 @@ struct user_struct;
+ struct writeback_control;
+ struct bdi_writeback;
+
++extern int sysctl_page_lock_unfairness;
++
+ void init_mm_internals(void);
+
+ #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
+--- a/include/linux/wait.h
++++ b/include/linux/wait.h
+@@ -20,6 +20,8 @@ int default_wake_function(struct wait_qu
+ #define WQ_FLAG_EXCLUSIVE 0x01
+ #define WQ_FLAG_WOKEN 0x02
+ #define WQ_FLAG_BOOKMARK 0x04
++#define WQ_FLAG_CUSTOM 0x08
++#define WQ_FLAG_DONE 0x10
+
+ /*
+ * A single wait-queue entry structure:
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -1563,6 +1563,14 @@ static struct ctl_table vm_table[] = {
+ .proc_handler = percpu_pagelist_fraction_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
++ {
++ .procname = "page_lock_unfairness",
++ .data = &sysctl_page_lock_unfairness,
++ .maxlen = sizeof(sysctl_page_lock_unfairness),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ },
+ #ifdef CONFIG_MMU
+ {
+ .procname = "max_map_count",
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -1044,9 +1044,43 @@ struct wait_page_queue {
+ wait_queue_entry_t wait;
+ };
+
++/*
++ * The page wait code treats the "wait->flags" somewhat unusually, because
++ * we have multiple different kinds of waits, not just the usual "exclusive"
++ * one.
++ *
++ * We have:
++ *
++ * (a) no special bits set:
++ *
++ * We're just waiting for the bit to be released, and when a waker
++ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
++ * and remove it from the wait queue.
++ *
++ * Simple and straightforward.
++ *
++ * (b) WQ_FLAG_EXCLUSIVE:
++ *
++ * The waiter is waiting to get the lock, and only one waiter should
++ * be woken up to avoid any thundering herd behavior. We'll set the
++ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
++ *
++ * This is the traditional exclusive wait.
++ *
++ * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
++ *
++ * The waiter is waiting to get the bit, and additionally wants the
++ * lock to be transferred to it for fair lock behavior. If the lock
++ * cannot be taken, we stop walking the wait queue without waking
++ * the waiter.
++ *
++ * This is the "fair lock handoff" case, and in addition to setting
++ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
++ * that it now has the lock.
++ */
+ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+ {
+- int ret;
++ unsigned int flags;
+ struct wait_page_key *key = arg;
+ struct wait_page_queue *wait_page
+ = container_of(wait, struct wait_page_queue, wait);
+@@ -1059,35 +1093,44 @@ static int wake_page_function(wait_queue
+ return 0;
+
+ /*
+- * If it's an exclusive wait, we get the bit for it, and
+- * stop walking if we can't.
+- *
+- * If it's a non-exclusive wait, then the fact that this
+- * wake function was called means that the bit already
+- * was cleared, and we don't care if somebody then
+- * re-took it.
++ * If it's a lock handoff wait, we get the bit for it, and
++ * stop walking (and do not wake it up) if we can't.
+ */
+- ret = 0;
+- if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+- if (test_and_set_bit(key->bit_nr, &key->page->flags))
++ flags = wait->flags;
++ if (flags & WQ_FLAG_EXCLUSIVE) {
++ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
+- ret = 1;
++ if (flags & WQ_FLAG_CUSTOM) {
++ if (test_and_set_bit(key->bit_nr, &key->page->flags))
++ return -1;
++ flags |= WQ_FLAG_DONE;
++ }
+ }
+- wait->flags |= WQ_FLAG_WOKEN;
+
++ /*
++ * We are holding the wait-queue lock, but the waiter that
++ * is waiting for this will be checking the flags without
++ * any locking.
++ *
++ * So update the flags atomically, and wake up the waiter
++ * afterwards to avoid any races. This store-release pairs
++ * with the load-acquire in wait_on_page_bit_common().
++ */
++ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
+ wake_up_state(wait->private, mode);
+
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+- * Note that this has to be the absolute last thing we do,
+- * since after list_del_init(&wait->entry) the wait entry
++ * Note that this pairs with the "finish_wait()" in the
++ * waiter, and has to be the absolute last thing we do.
++ * After this list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+- return ret;
++ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
+ }
+
+ static void wake_up_page_bit(struct page *page, int bit_nr)
+@@ -1167,8 +1210,8 @@ enum behavior {
+ };
+
+ /*
+- * Attempt to check (or get) the page bit, and mark the
+- * waiter woken if successful.
++ * Attempt to check (or get) the page bit, and mark us done
++ * if successful.
+ */
+ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+ struct wait_queue_entry *wait)
+@@ -1179,13 +1222,17 @@ static inline bool trylock_page_bit_comm
+ } else if (test_bit(bit_nr, &page->flags))
+ return false;
+
+- wait->flags |= WQ_FLAG_WOKEN;
++ wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
+ return true;
+ }
+
++/* How many times do we accept lock stealing from under a waiter? */
++int sysctl_page_lock_unfairness = 5;
++
+ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ struct page *page, int bit_nr, int state, enum behavior behavior)
+ {
++ int unfairness = sysctl_page_lock_unfairness;
+ struct wait_page_queue wait_page;
+ wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+@@ -1203,11 +1250,18 @@ static inline int wait_on_page_bit_commo
+ }
+
+ init_wait(wait);
+- wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
+ wait->func = wake_page_function;
+ wait_page.page = page;
+ wait_page.bit_nr = bit_nr;
+
++repeat:
++ wait->flags = 0;
++ if (behavior == EXCLUSIVE) {
++ wait->flags = WQ_FLAG_EXCLUSIVE;
++ if (--unfairness < 0)
++ wait->flags |= WQ_FLAG_CUSTOM;
++ }
++
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+@@ -1230,27 +1284,63 @@ static inline int wait_on_page_bit_commo
+
+ /*
+ * From now on, all the logic will be based on
+- * the WQ_FLAG_WOKEN flag, and the and the page
+- * bit testing (and setting) will be - or has
+- * already been - done by the wake function.
++ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flags, to
++ * see whether the page bit testing has already
++ * been done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
+
++ /*
++ * Note that until the "finish_wait()", or until
++ * we see the WQ_FLAG_WOKEN flag, we need to
++ * be very careful with the 'wait->flags', because
++ * we may race with a waker that sets them.
++ */
+ for (;;) {
++ unsigned int flags;
++
+ set_current_state(state);
+
+- if (signal_pending_state(state, current))
++ /* Loop until we've been woken or interrupted */
++ flags = smp_load_acquire(&wait->flags);
++ if (!(flags & WQ_FLAG_WOKEN)) {
++ if (signal_pending_state(state, current))
++ break;
++
++ io_schedule();
++ continue;
++ }
++
++ /* If we were non-exclusive, we're done */
++ if (behavior != EXCLUSIVE)
+ break;
+
+- if (wait->flags & WQ_FLAG_WOKEN)
++ /* If the waker got the lock for us, we're done */
++ if (flags & WQ_FLAG_DONE)
+ break;
+
+- io_schedule();
++ /*
++ * Otherwise, if we're getting the lock, we need to
++ * try to get it ourselves.
++ *
++ * And if that fails, we'll have to retry this all.
++ */
++ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
++ goto repeat;
++
++ wait->flags |= WQ_FLAG_DONE;
++ break;
+ }
+
++ /*
++ * If a signal happened, this 'finish_wait()' may remove the last
++ * waiter from the wait-queues, but the PageWaiters bit will remain
++ * set. That's ok. The next wakeup will take care of it, and trying
++ * to do it here would be difficult and prone to races.
++ */
+ finish_wait(q, wait);
+
+ if (thrashing) {
+@@ -1260,12 +1350,20 @@ static inline int wait_on_page_bit_commo
+ }
+
+ /*
+- * A signal could leave PageWaiters set. Clearing it here if
+- * !waitqueue_active would be possible (by open-coding finish_wait),
+- * but still fail to catch it in the case of wait hash collision. We
+- * already can fail to clear wait hash collision cases, so don't
+- * bother with signals either.
++ * NOTE! The wait->flags weren't stable until we've done the
++ * 'finish_wait()', and we could have exited the loop above due
++ * to a signal, and had a wakeup event happen after the signal
++ * test but before the 'finish_wait()'.
++ *
++ * So only after the finish_wait() can we reliably determine
++ * if we got woken up or not, so we can now figure out the final
++ * return value based on that state without races.
++ *
++ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
++ * waiter, but an exclusive one requires WQ_FLAG_DONE.
+ */
++ if (behavior == EXCLUSIVE)
++ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
+
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
+ }