+++ /dev/null
-From 2194a9643e933a16a92f83d3859f3916f95a5e42 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:30 +0100
-Subject: atomics: Provide atomic_add_negative() variants
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit e5ab9eff46b04c5a04778e40d7092fed3fda52ca ]
-
-atomic_add_negative() does not provide the relaxed/acquire/release
-variants.
-
-Provide them in preparation for a new scalable reference count algorithm.
-
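-As with the existing ordered fallbacks, each new variant is equivalent to
-the correspondingly ordered add_return() followed by a sign test. A
-userspace C11 approximation (illustrative only, not the kernel code):
-
-  #include <stdatomic.h>
-  #include <stdbool.h>
-
-  static bool add_negative_release(atomic_int *v, int i)
-  {
-          /* fetch_add returns the old value; old + i is the new value */
-          return atomic_fetch_add_explicit(v, i, memory_order_release) + i < 0;
-  }
-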
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Acked-by: Mark Rutland <mark.rutland@arm.com>
-Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/atomic/atomic-arch-fallback.h | 208 +++++++++++++++++++-
- include/linux/atomic/atomic-instrumented.h | 68 ++++++-
- include/linux/atomic/atomic-long.h | 38 +++-
- scripts/atomic/atomics.tbl | 2 +-
- scripts/atomic/fallbacks/add_negative | 11 +-
- 5 files changed, 309 insertions(+), 18 deletions(-)
-
-diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
-index 77bc5522e61c6..4226379a232d5 100644
---- a/include/linux/atomic/atomic-arch-fallback.h
-+++ b/include/linux/atomic/atomic-arch-fallback.h
-@@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v)
- #define arch_atomic_inc_and_test arch_atomic_inc_and_test
- #endif
-
-+#ifndef arch_atomic_add_negative_relaxed
-+#ifdef arch_atomic_add_negative
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative
-+#define arch_atomic_add_negative_release arch_atomic_add_negative
-+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative
-+#endif /* arch_atomic_add_negative */
-+
- #ifndef arch_atomic_add_negative
- /**
-- * arch_atomic_add_negative - add and test if negative
-+ * arch_atomic_add_negative - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
- arch_atomic_add_negative(int i, atomic_t *v)
-@@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v)
- #define arch_atomic_add_negative arch_atomic_add_negative
- #endif
-
-+#ifndef arch_atomic_add_negative_acquire
-+/**
-+ * arch_atomic_add_negative_acquire - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_acquire(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic_add_negative_release
-+/**
-+ * arch_atomic_add_negative_release - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_release(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic_add_negative_relaxed
-+/**
-+ * arch_atomic_add_negative_relaxed - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_relaxed(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_relaxed(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed
-+#endif
-+
-+#else /* arch_atomic_add_negative_relaxed */
-+
-+#ifndef arch_atomic_add_negative_acquire
-+static __always_inline bool
-+arch_atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ bool ret = arch_atomic_add_negative_relaxed(i, v);
-+ __atomic_acquire_fence();
-+ return ret;
-+}
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic_add_negative_release
-+static __always_inline bool
-+arch_atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ __atomic_release_fence();
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic_add_negative
-+static __always_inline bool
-+arch_atomic_add_negative(int i, atomic_t *v)
-+{
-+ bool ret;
-+ __atomic_pre_full_fence();
-+ ret = arch_atomic_add_negative_relaxed(i, v);
-+ __atomic_post_full_fence();
-+ return ret;
-+}
-+#define arch_atomic_add_negative arch_atomic_add_negative
-+#endif
-+
-+#endif /* arch_atomic_add_negative_relaxed */
-+
- #ifndef arch_atomic_fetch_add_unless
- /**
- * arch_atomic_fetch_add_unless - add unless the number is already a given value
-@@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v)
- #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
- #endif
-
-+#ifndef arch_atomic64_add_negative_relaxed
-+#ifdef arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative
-+#endif /* arch_atomic64_add_negative */
-+
- #ifndef arch_atomic64_add_negative
- /**
-- * arch_atomic64_add_negative - add and test if negative
-+ * arch_atomic64_add_negative - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic64_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
- arch_atomic64_add_negative(s64 i, atomic64_t *v)
-@@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v)
- #define arch_atomic64_add_negative arch_atomic64_add_negative
- #endif
-
-+#ifndef arch_atomic64_add_negative_acquire
-+/**
-+ * arch_atomic64_add_negative_acquire - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_acquire(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_release
-+/**
-+ * arch_atomic64_add_negative_release - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_release(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_relaxed
-+/**
-+ * arch_atomic64_add_negative_relaxed - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_relaxed(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed
-+#endif
-+
-+#else /* arch_atomic64_add_negative_relaxed */
-+
-+#ifndef arch_atomic64_add_negative_acquire
-+static __always_inline bool
-+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ bool ret = arch_atomic64_add_negative_relaxed(i, v);
-+ __atomic_acquire_fence();
-+ return ret;
-+}
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_release
-+static __always_inline bool
-+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ __atomic_release_fence();
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic64_add_negative
-+static __always_inline bool
-+arch_atomic64_add_negative(s64 i, atomic64_t *v)
-+{
-+ bool ret;
-+ __atomic_pre_full_fence();
-+ ret = arch_atomic64_add_negative_relaxed(i, v);
-+ __atomic_post_full_fence();
-+ return ret;
-+}
-+#define arch_atomic64_add_negative arch_atomic64_add_negative
-+#endif
-+
-+#endif /* arch_atomic64_add_negative_relaxed */
-+
- #ifndef arch_atomic64_fetch_add_unless
- /**
- * arch_atomic64_fetch_add_unless - add unless the number is already a given value
-@@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v)
- #endif
-
- #endif /* _LINUX_ATOMIC_FALLBACK_H */
--// b5e87bdd5ede61470c29f7a7e4de781af3770f09
-+// 00071fffa021cec66f6290d706d69c91df87bade
-diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
-index 7a139ec030b0c..0496816738ca9 100644
---- a/include/linux/atomic/atomic-instrumented.h
-+++ b/include/linux/atomic/atomic-instrumented.h
-@@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v)
- return arch_atomic_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_add_negative_relaxed(int i, atomic_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline int
- atomic_fetch_add_unless(atomic_t *v, int a, int u)
- {
-@@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v)
- return arch_atomic64_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline s64
- atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
- {
-@@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic_long_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
- })
-
- #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
--// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
-+// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a
-diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
-index 800b8c35992d1..2fc51ba66bebd 100644
---- a/include/linux/atomic/atomic-long.h
-+++ b/include/linux/atomic/atomic-long.h
-@@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic64_add_negative(i, v);
- }
-
-+static __always_inline bool
-+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic_add_negative(i, v);
- }
-
-+static __always_inline bool
-+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v)
-
- #endif /* CONFIG_64BIT */
- #endif /* _LINUX_ATOMIC_LONG_H */
--// e8f0e08ff072b74d180eabe2ad001282b38c2c88
-+// a194c07d7d2f4b0e178d3c118c919775d5d65f50
-diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl
-index fbee2f6190d9e..85ca8d9b5c279 100644
---- a/scripts/atomic/atomics.tbl
-+++ b/scripts/atomic/atomics.tbl
-@@ -33,7 +33,7 @@ try_cmpxchg B v p:old i:new
- sub_and_test b i v
- dec_and_test b v
- inc_and_test b v
--add_negative b i v
-+add_negative B i v
- add_unless fb v i:a i:u
- inc_not_zero b v
- inc_unless_negative b v
-diff --git a/scripts/atomic/fallbacks/add_negative b/scripts/atomic/fallbacks/add_negative
-index 15caa2eb23712..e5980abf5904e 100755
---- a/scripts/atomic/fallbacks/add_negative
-+++ b/scripts/atomic/fallbacks/add_negative
-@@ -1,16 +1,15 @@
- cat <<EOF
- /**
-- * arch_${atomic}_add_negative - add and test if negative
-+ * arch_${atomic}_add_negative${order} - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type ${atomic}_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
--arch_${atomic}_add_negative(${int} i, ${atomic}_t *v)
-+arch_${atomic}_add_negative${order}(${int} i, ${atomic}_t *v)
- {
-- return arch_${atomic}_add_return(i, v) < 0;
-+ return arch_${atomic}_add_return${order}(i, v) < 0;
- }
- EOF
---
-2.40.1
-
+++ /dev/null
-From 47ac5394259bfc9dd07646a58feea1be4e624eef Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:31 +0100
-Subject: atomics: Provide rcuref - scalable reference counting
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit ee1ee6db07795d9637bc5e8993a8ddcf886541ef ]
-
-atomic_t based reference counting, including refcount_t, uses
-atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
-implemented with an atomic_try_cmpxchg() loop. High contention of the
-reference count leads to retry loops and scales badly. There is nothing to
-improve on this implementation as the semantics have to be preserved.
-
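-For reference, a userspace C11 sketch of that pattern (an approximation,
-not the kernel implementation) shows where the retries come from: every
-failed compare-exchange reloads the counter and restarts the loop.
-
-  #include <stdatomic.h>
-  #include <stdbool.h>
-
-  static bool inc_not_zero(atomic_uint *v)
-  {
-          unsigned int old = atomic_load_explicit(v, memory_order_relaxed);
-
-          do {
-                  if (old == 0)
-                          return false;
-                  /* on failure 'old' is refreshed and the loop retries */
-          } while (!atomic_compare_exchange_weak(v, &old, old + 1));
-
-          return true;
-  }
-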
-Provide rcuref as a scalable alternative solution which is suitable for RCU
-managed objects. Similar to refcount_t it comes with overflow and underflow
-detection and mitigation.
-
-rcuref treats the underlying atomic_t as an unsigned integer and partitions
-this space into zones:
-
- 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references)
- 0x80000000 - 0xBFFFFFFF saturation zone
- 0xC0000000 - 0xFFFFFFFE dead zone
- 0xFFFFFFFF no reference
-
-rcuref_get() unconditionally increments the reference count with
-atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
-reference count with atomic_add_negative_release().
-
-This unconditional increment avoids the inc_not_zero() problem, but
-requires a more complex implementation on the put() side when the count
-drops from 0 to -1.
-
-When this transition is detected, an attempt is made to mark the reference
-count dead by setting it to the midpoint of the dead zone with a single
-atomic_cmpxchg_release() operation. This operation can fail due to a
-concurrent rcuref_get() elevating the reference count from -1 to 0 again.
-
-If the unconditional increment in rcuref_get() hits a reference count which
-is marked dead (or saturated) it will detect it after the fact and bring
-back the reference count to the midpoint of the respective zone. The zones
-provide enough tolerance which makes it practically impossible to escape
-from a zone.
-
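-A minimal usage sketch for an RCU managed object (illustrative only; the
-struct, lookup() and the removal helper are hypothetical, while the rcuref
-API is the one introduced here):
-
-  struct obj {
-          rcuref_t ref;
-          struct rcu_head rcu;
-  };
-
-  /* acquire: the RCU read side keeps the object memory stable */
-  rcu_read_lock();
-  p = lookup(key);
-  if (p && !rcuref_get(&p->ref))
-          p = NULL;
-  rcu_read_unlock();
-
-  /* release: the last put allows scheduling the object for destruction */
-  if (rcuref_put(&p->ref)) {
-          remove_from_lookup_structure(p);
-          kfree_rcu(p, rcu);
-  }
-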
-The racy implementation of rcuref_put() requires protecting rcuref_put()
-against a grace period ending, in order to prevent a subtle use after
-free. As RCU is the only mechanism which can provide that protection, it
-is not possible to fully replace the atomic_inc_not_zero() based
-implementation of refcount_t with this scheme.
-
-The final drop is slightly more expensive than the atomic_dec_return()
-counterpart, but that's not the case this is optimized for. The
-optimization is for the high-frequency get()/put() pairs and their
-scalability.
-
-The performance of an uncontended rcuref_get()/put() pair where the put()
-is not dropping the last reference is still on par with the plain atomic
-operations, while at the same time providing overflow and underflow
-detection and mitigation.
-
-The performance of rcuref compared to plain atomic_inc_not_zero() and
-atomic_dec_return() based reference counting under contention:
-
- - Micro benchmark: All CPUs running an increment/decrement loop on an
- elevated reference count, which means the 0 to -1 transition never
- happens.
-
- The performance gain depends on microarchitecture and the number of
- CPUs and has been observed in the range of 1.3X to 4.7X
-
- - Conversion of dst_entry::__refcnt to rcuref and testing with the
- localhost memtier/memcached benchmark. That benchmark shows the
- reference count contention prominently.
-
- The performance gain depends on microarchitecture and the number of
- CPUs and has been observed in the range of 1.1X to 2.6X over the
- previous fix for the false sharing issue vs. struct
- dst_entry::__refcnt.
-
- When memtier is run over a real 1Gb network connection, there is a
- small gain on top of the false sharing fix. The two changes combined
- result in a 2%-5% total gain for that networked test.
-
-Reported-by: Wangyang Guo <wangyang.guo@intel.com>
-Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/rcuref.h | 155 +++++++++++++++++++++++
- include/linux/types.h | 6 +
- lib/Makefile | 2 +-
- lib/rcuref.c | 281 +++++++++++++++++++++++++++++++++++++++++
- 4 files changed, 443 insertions(+), 1 deletion(-)
- create mode 100644 include/linux/rcuref.h
- create mode 100644 lib/rcuref.c
-
-diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
-new file mode 100644
-index 0000000000000..2c8bfd0f1b6b3
---- /dev/null
-+++ b/include/linux/rcuref.h
-@@ -0,0 +1,155 @@
-+/* SPDX-License-Identifier: GPL-2.0-only */
-+#ifndef _LINUX_RCUREF_H
-+#define _LINUX_RCUREF_H
-+
-+#include <linux/atomic.h>
-+#include <linux/bug.h>
-+#include <linux/limits.h>
-+#include <linux/lockdep.h>
-+#include <linux/preempt.h>
-+#include <linux/rcupdate.h>
-+
-+#define RCUREF_ONEREF 0x00000000U
-+#define RCUREF_MAXREF 0x7FFFFFFFU
-+#define RCUREF_SATURATED 0xA0000000U
-+#define RCUREF_RELEASED 0xC0000000U
-+#define RCUREF_DEAD 0xE0000000U
-+#define RCUREF_NOREF 0xFFFFFFFFU
-+
-+/**
-+ * rcuref_init - Initialize a rcuref reference count with the given reference count
-+ * @ref: Pointer to the reference count
-+ * @cnt: The initial reference count typically '1'
-+ */
-+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
-+{
-+ atomic_set(&ref->refcnt, cnt - 1);
-+}
-+
-+/**
-+ * rcuref_read - Read the number of held reference counts of a rcuref
-+ * @ref: Pointer to the reference count
-+ *
-+ * Return: The number of held references (0 ... N)
-+ */
-+static inline unsigned int rcuref_read(rcuref_t *ref)
-+{
-+ unsigned int c = atomic_read(&ref->refcnt);
-+
-+ /* Return 0 if within the DEAD zone. */
-+ return c >= RCUREF_RELEASED ? 0 : c + 1;
-+}
-+
-+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
-+
-+/**
-+ * rcuref_get - Acquire one reference on a rcuref reference count
-+ * @ref: Pointer to the reference count
-+ *
-+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
-+ *
-+ * Provides no memory ordering, it is assumed the caller has guaranteed the
-+ * object memory to be stable (RCU, etc.). It does provide a control dependency
-+ * and thereby orders future stores. See documentation in lib/rcuref.c
-+ *
-+ * Return:
-+ * False if the attempt to acquire a reference failed. This happens
-+ * when the last reference has been put already
-+ *
-+ * True if a reference was successfully acquired
-+ */
-+static inline __must_check bool rcuref_get(rcuref_t *ref)
-+{
-+ /*
-+ * Unconditionally increase the reference count. The saturation and
-+ * dead zones provide enough tolerance for this.
-+ */
-+ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
-+ return true;
-+
-+ /* Handle the cases inside the saturation and dead zones */
-+ return rcuref_get_slowpath(ref);
-+}
-+
-+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
-+
-+/*
-+ * Internal helper. Do not invoke directly.
-+ */
-+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
-+{
-+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
-+ "suspicious rcuref_put_rcusafe() usage");
-+ /*
-+ * Unconditionally decrease the reference count. The saturation and
-+ * dead zones provide enough tolerance for this.
-+ */
-+ if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
-+ return false;
-+
-+ /*
-+ * Handle the last reference drop and cases inside the saturation
-+ * and dead zones.
-+ */
-+ return rcuref_put_slowpath(ref);
-+}
-+
-+/**
-+ * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
-+ * @ref: Pointer to the reference count
-+ *
-+ * Provides release memory ordering, such that prior loads and stores are done
-+ * before, and provides an acquire ordering on success such that free()
-+ * must come after.
-+ *
-+ * Can be invoked from contexts, which guarantee that no grace period can
-+ * happen which would free the object concurrently if the decrement drops
-+ * the last reference and the slowpath races against a concurrent get() and
-+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
-+ *
-+ * Return:
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely release the
-+ * object which is protected by the reference counter.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * release the protected object.
-+ */
-+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
-+{
-+ return __rcuref_put(ref);
-+}
-+
-+/**
-+ * rcuref_put -- Release one reference for a rcuref reference count
-+ * @ref: Pointer to the reference count
-+ *
-+ * Can be invoked from any context.
-+ *
-+ * Provides release memory ordering, such that prior loads and stores are done
-+ * before, and provides an acquire ordering on success such that free()
-+ * must come after.
-+ *
-+ * Return:
-+ *
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely schedule the
-+ * object, which is protected by the reference counter, for
-+ * deconstruction.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * deconstruct the protected object.
-+ */
-+static inline __must_check bool rcuref_put(rcuref_t *ref)
-+{
-+ bool released;
-+
-+ preempt_disable();
-+ released = __rcuref_put(ref);
-+ preempt_enable();
-+ return released;
-+}
-+
-+#endif
-diff --git a/include/linux/types.h b/include/linux/types.h
-index ea8cf60a8a795..688fb943556a1 100644
---- a/include/linux/types.h
-+++ b/include/linux/types.h
-@@ -175,6 +175,12 @@ typedef struct {
- } atomic64_t;
- #endif
-
-+typedef struct {
-+ atomic_t refcnt;
-+} rcuref_t;
-+
-+#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) }
-+
- struct list_head {
- struct list_head *next, *prev;
- };
-diff --git a/lib/Makefile b/lib/Makefile
-index 5ffe72ec99797..afd78c497ec76 100644
---- a/lib/Makefile
-+++ b/lib/Makefile
-@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
- list_sort.o uuid.o iov_iter.o clz_ctz.o \
- bsearch.o find_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o rhashtable.o base64.o \
-- once.o refcount.o usercopy.o errseq.o bucket_locks.o \
-+ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
- generic-radix-tree.o
- obj-$(CONFIG_STRING_SELFTEST) += test_string.o
- obj-y += string_helpers.o
-diff --git a/lib/rcuref.c b/lib/rcuref.c
-new file mode 100644
-index 0000000000000..5ec00a4a64d11
---- /dev/null
-+++ b/lib/rcuref.c
-@@ -0,0 +1,281 @@
-+// SPDX-License-Identifier: GPL-2.0-only
-+
-+/*
-+ * rcuref - A scalable reference count implementation for RCU managed objects
-+ *
-+ * rcuref is provided to replace open coded reference count implementations
-+ * based on atomic_t. It protects explicitly RCU managed objects which can
-+ * be visible even after the last reference has been dropped and the object
-+ * is heading towards destruction.
-+ *
-+ * A common usage pattern is:
-+ *
-+ * get()
-+ * rcu_read_lock();
-+ * p = get_ptr();
-+ * if (p && !atomic_inc_not_zero(&p->refcnt))
-+ * p = NULL;
-+ * rcu_read_unlock();
-+ * return p;
-+ *
-+ * put()
-+ *		if (!atomic_dec_return(&p->refcnt)) {
-+ * remove_ptr(p);
-+ *			kfree_rcu(p, rcu);
-+ * }
-+ *
-+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
-+ * O(N^2) behaviour under contention with N concurrent operations.
-+ *
-+ * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
-+ * better under contention.
-+ *
-+ * Why not refcount?
-+ * =================
-+ *
-+ * In principle it should be possible to make refcount use the rcuref
-+ * scheme, but the destruction race described below cannot be prevented
-+ * unless the protected object is RCU managed.
-+ *
-+ * Theory of operation
-+ * ===================
-+ *
-+ * rcuref uses an unsigned integer reference counter. As long as the
-+ * counter value is greater than or equal to RCUREF_ONEREF and not larger
-+ * than RCUREF_MAXREF the reference is alive:
-+ *
-+ * ONEREF MAXREF SATURATED RELEASED DEAD NOREF
-+ * 0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
-+ * <---valid --------> <-------saturation zone-------> <-----dead zone----->
-+ *
-+ * The get() and put() operations do unconditional increments and
-+ * decrements. The result is checked after the operation. This optimizes
-+ * for the fast path.
-+ *
-+ * If the reference count is saturated or dead, then the increments and
-+ * decrements are not harmful as the reference count still stays in the
-+ * respective zones and is always set back to SATURATED resp. DEAD. The
-+ * zones have room for 2^28 racing operations in each direction, which
-+ * makes it practically impossible to escape the zones.
-+ *
-+ * Once the last reference is dropped the reference count becomes
-+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
-+ * slowpath then tries to set the reference count from RCUREF_NOREF to
-+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
-+ * concurrent rcuref_get() can acquire the reference count and bring it
-+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
-+ *
-+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
-+ * DEAD + 1, which is inside the dead zone. If that happens the reference
-+ * count is put back to DEAD.
-+ *
-+ * The actual race is possible due to the unconditional increment and
-+ * decrements in rcuref_get() and rcuref_put():
-+ *
-+ * T1 T2
-+ * get() put()
-+ * if (atomic_add_negative(-1, &ref->refcnt))
-+ * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
-+ *
-+ * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1
-+ *
-+ * As the result of T1's add is negative, the get() goes into the slow path
-+ * and observes refcnt being in the dead zone which makes the operation fail.
-+ *
-+ * Possible critical states:
-+ *
-+ * Context Counter References Operation
-+ * T1 0 1 init()
-+ * T2 1 2 get()
-+ * T1 0 1 put()
-+ * T2 -1 0 put() tries to mark dead
-+ * T1 0 1 get()
-+ * T2 0 1 put() mark dead fails
-+ * T1 -1 0 put() tries to mark dead
-+ * T1 DEAD 0 put() mark dead succeeds
-+ * T2 DEAD+1 0 get() fails and puts it back to DEAD
-+ *
-+ * Of course there are more complex scenarios, but the above illustrates
-+ * the working principle. The rest is left to the imagination of the
-+ * reader.
-+ *
-+ * Deconstruction race
-+ * ===================
-+ *
-+ * The release operation must be protected by prohibiting a grace period in
-+ * order to prevent a possible use after free:
-+ *
-+ * T1 T2
-+ * put() get()
-+ * // ref->refcnt = ONEREF
-+ * if (!atomic_add_negative(-1, &ref->refcnt))
-+ * return false; <- Not taken
-+ *
-+ * // ref->refcnt == NOREF
-+ * --> preemption
-+ * // Elevates ref->refcnt to ONEREF
-+ * if (!atomic_add_negative(1, &ref->refcnt))
-+ * return true; <- taken
-+ *
-+ * if (put(&p->ref)) { <-- Succeeds
-+ * remove_pointer(p);
-+ * kfree_rcu(p, rcu);
-+ * }
-+ *
-+ * RCU grace period ends, object is freed
-+ *
-+ * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF
-+ *
-+ * This is prevented by disabling preemption around the put() operation as
-+ * that's in most kernel configurations cheaper than a rcu_read_lock() /
-+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
-+ * prevents the grace period which keeps the object alive until all put()
-+ * operations complete.
-+ *
-+ * Saturation protection
-+ * =====================
-+ *
-+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
-+ * Once this is exceeded the reference count becomes stale by setting it
-+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
-+ * wraparounds which obviously cause worse problems than a memory
-+ * leak. When saturation is reached a warning is emitted.
-+ *
-+ * Race conditions
-+ * ===============
-+ *
-+ * All reference count increment/decrement operations are unconditional and
-+ * only verified after the fact. This optimizes for the good case and takes
-+ * the occasional race vs. a dead or already saturated refcount into
-+ * account. The saturation and dead zones are large enough to accomodate
-+ * for that.
-+ *
-+ * Memory ordering
-+ * ===============
-+ *
-+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
-+ * and provide only what is strictly required for refcounts.
-+ *
-+ * The increments are fully relaxed; these will not provide ordering. The
-+ * rationale is that whatever is used to obtain the object to increase the
-+ * reference count on will provide the ordering. For locked data
-+ * structures, it's the lock acquire; for RCU/lockless data structures it's
-+ * the dependent load.
-+ *
-+ * rcuref_get() provides a control dependency ordering future stores which
-+ * ensures that the object is not modified when acquiring a reference
-+ * fails.
-+ *
-+ * rcuref_put() provides release order, i.e. all prior loads and stores
-+ * will be issued before. It also provides a control dependency ordering
-+ * against the subsequent destruction of the object.
-+ *
-+ * If rcuref_put() successfully dropped the last reference and marked the
-+ * object DEAD it also provides acquire ordering.
-+ */
-+
-+#include <linux/export.h>
-+#include <linux/rcuref.h>
-+
-+/**
-+ * rcuref_get_slowpath - Slowpath of rcuref_get()
-+ * @ref: Pointer to the reference count
-+ *
-+ * Invoked when the reference count is outside of the valid zone.
-+ *
-+ * Return:
-+ * False if the reference count was already marked dead
-+ *
-+ * True if the reference count is saturated, which prevents the
-+ * object from being deconstructed ever.
-+ */
-+bool rcuref_get_slowpath(rcuref_t *ref)
-+{
-+ unsigned int cnt = atomic_read(&ref->refcnt);
-+
-+ /*
-+ * If the reference count was already marked dead, undo the
-+ * increment so it stays in the middle of the dead zone and return
-+ * fail.
-+ */
-+ if (cnt >= RCUREF_RELEASED) {
-+ atomic_set(&ref->refcnt, RCUREF_DEAD);
-+ return false;
-+ }
-+
-+ /*
-+ * If it was saturated, warn and mark it so. In case the increment
-+ * was already on a saturated value restore the saturation
-+ * marker. This keeps it in the middle of the saturation zone and
-+ * prevents the reference count from overflowing. This leaks the
-+ * object memory, but prevents the obvious reference count overflow
-+ * damage.
-+ */
-+ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
-+ atomic_set(&ref->refcnt, RCUREF_SATURATED);
-+ return true;
-+}
-+EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
-+
-+/**
-+ * rcuref_put_slowpath - Slowpath of __rcuref_put()
-+ * @ref: Pointer to the reference count
-+ *
-+ * Invoked when the reference count is outside of the valid zone.
-+ *
-+ * Return:
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely schedule the
-+ * object, which is protected by the reference counter, for
-+ * deconstruction.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * deconstruct the protected object.
-+ */
-+bool rcuref_put_slowpath(rcuref_t *ref)
-+{
-+ unsigned int cnt = atomic_read(&ref->refcnt);
-+
-+ /* Did this drop the last reference? */
-+ if (likely(cnt == RCUREF_NOREF)) {
-+ /*
-+ * Carefully try to set the reference count to RCUREF_DEAD.
-+ *
-+ * This can fail if a concurrent get() operation has
-+ * elevated it again or the corresponding put() even marked
-+ * it dead already. Both are valid situations and do not
-+ * require a retry. If this fails the caller is not
-+ * allowed to deconstruct the object.
-+ */
-+ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
-+ return false;
-+
-+ /*
-+ * The caller can safely schedule the object for
-+ * deconstruction. Provide acquire ordering.
-+ */
-+ smp_acquire__after_ctrl_dep();
-+ return true;
-+ }
-+
-+ /*
-+ * If the reference count was already in the dead zone, then this
-+ * put() operation is imbalanced. Warn, put the reference count back to
-+ * DEAD and tell the caller to not deconstruct the object.
-+ */
-+ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
-+ atomic_set(&ref->refcnt, RCUREF_DEAD);
-+ return false;
-+ }
-+
-+ /*
-+ * This is a put() operation on a saturated refcount. Restore the
-+ * mean saturation value and tell the caller to not deconstruct the
-+ * object.
-+ */
-+ if (cnt > RCUREF_MAXREF)
-+ atomic_set(&ref->refcnt, RCUREF_SATURATED);
-+ return false;
-+}
-+EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
---
-2.40.1
-
+++ /dev/null
-From f4fdfd10202488104e6e484bd76fd1b5cd7c10c6 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 20 Apr 2023 20:25:08 +0200
-Subject: net: dst: fix missing initialization of rt_uncached
-
-From: Maxime Bizon <mbizon@freebox.fr>
-
-[ Upstream commit 418a73074da9182f571e467eaded03ea501f3281 ]
-
-xfrm_alloc_dst() followed by xfrm4_dst_destroy(), without a
-xfrm4_fill_dst() call in between, causes the following BUG:
-
- BUG: spinlock bad magic on CPU#0, fbxhostapd/732
- lock: 0x890b7668, .magic: 890b7668, .owner: <none>/-1, .owner_cpu: 0
- CPU: 0 PID: 732 Comm: fbxhostapd Not tainted 6.3.0-rc6-next-20230414-00613-ge8de66369925-dirty #9
- Hardware name: Marvell Kirkwood (Flattened Device Tree)
- unwind_backtrace from show_stack+0x10/0x14
- show_stack from dump_stack_lvl+0x28/0x30
- dump_stack_lvl from do_raw_spin_lock+0x20/0x80
- do_raw_spin_lock from rt_del_uncached_list+0x30/0x64
- rt_del_uncached_list from xfrm4_dst_destroy+0x3c/0xbc
- xfrm4_dst_destroy from dst_destroy+0x5c/0xb0
- dst_destroy from rcu_process_callbacks+0xc4/0xec
- rcu_process_callbacks from __do_softirq+0xb4/0x22c
- __do_softirq from call_with_stack+0x1c/0x24
- call_with_stack from do_softirq+0x60/0x6c
- do_softirq from __local_bh_enable_ip+0xa0/0xcc
-
-Patch "net: dst: Prevent false sharing vs. dst_entry:: __refcnt" moved
-rt_uncached and rt_uncached_list fields from rtable struct to dst
-struct, so they are no longer zeroed by memset_after(xdst, 0, u.dst) in
-xfrm_alloc_dst().
-
-Note that rt_uncached (list_head) was never properly initialized at
-alloc time, but xfrm[46]_dst_destroy() is written in such a way that
-it was not an issue thanks to the memset:
-
- if (xdst->u.rt.dst.rt_uncached_list)
- rt_del_uncached_list(&xdst->u.rt);
-
-The route code does it the other way around: rt_uncached_list is
-assumed to be valid iff the rt_uncached list_head is not empty:
-
-void rt_del_uncached_list(struct rtable *rt)
-{
- if (!list_empty(&rt->dst.rt_uncached)) {
- struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
- list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
-}
-
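-The invariant relied on there is that an initialized but unlinked
-list_head points to itself, so list_empty() only answers "is this entry
-on a list?" reliably after INIT_LIST_HEAD(). A simplified userspace
-illustration (not the kernel implementation):
-
-  #include <stdbool.h>
-
-  struct list_head { struct list_head *next, *prev; };
-
-  static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
-  static bool list_empty(const struct list_head *h) { return h->next == h; }
-
-  /* Without INIT_LIST_HEAD(), next/prev hold whatever the allocator left
-   * behind, list_empty() may report "on a list", and the del path then
-   * takes a lock through a garbage rt_uncached_list pointer - the BUG
-   * above. */
-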
-This patch adds mandatory rt_uncached list_head initialization in
-generic dst_init(), and adapts the xfrm[46]_dst_destroy() logic to match the
-rest of the code.
-
-Fixes: d288a162dd1c ("net: dst: Prevent false sharing vs. dst_entry:: __refcnt")
-Reported-by: kernel test robot <oliver.sang@intel.com>
-Link: https://lore.kernel.org/oe-lkp/202304162125.18b7bcdd-oliver.sang@intel.com
-Reviewed-by: David Ahern <dsahern@kernel.org>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-CC: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Maxime Bizon <mbizon@freebox.fr>
-Link: https://lore.kernel.org/r/20230420182508.2417582-1-mbizon@freebox.fr
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- net/core/dst.c | 1 +
- net/ipv4/route.c | 4 ----
- net/ipv4/xfrm4_policy.c | 4 +---
- net/ipv6/route.c | 1 -
- net/ipv6/xfrm6_policy.c | 4 +---
- 5 files changed, 3 insertions(+), 11 deletions(-)
-
-diff --git a/net/core/dst.c b/net/core/dst.c
-index 2b7b1619b5e29..1666a6f5e858e 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -67,6 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- #endif
- dst->lwtstate = NULL;
- rcuref_init(&dst->__rcuref, initial_ref);
-+ INIT_LIST_HEAD(&dst->rt_uncached);
- dst->__use = 0;
- dst->lastuse = jiffies;
- dst->flags = flags;
-diff --git a/net/ipv4/route.c b/net/ipv4/route.c
-index 7ccf6503d67aa..a44d20644fbc2 100644
---- a/net/ipv4/route.c
-+++ b/net/ipv4/route.c
-@@ -1646,7 +1646,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
- rt->rt_uses_gateway = 0;
- rt->rt_gw_family = 0;
- rt->rt_gw4 = 0;
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
-
- rt->dst.output = ip_output;
- if (flags & RTCF_LOCAL)
-@@ -1677,7 +1676,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
- new_rt->rt_gw4 = rt->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- new_rt->rt_gw6 = rt->rt_gw6;
-- INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
-
- new_rt->dst.input = rt->dst.input;
- new_rt->dst.output = rt->dst.output;
-@@ -2861,8 +2859,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
- rt->rt_gw4 = ort->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- rt->rt_gw6 = ort->rt_gw6;
--
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- dst_release(dst_orig);
-diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
-index 47861c8b7340e..9403bbaf1b616 100644
---- a/net/ipv4/xfrm4_policy.c
-+++ b/net/ipv4/xfrm4_policy.c
-@@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt.rt_gw6 = rt->rt_gw6;
- xdst->u.rt.rt_pmtu = rt->rt_pmtu;
- xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-- INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
- rt_add_uncached_list(&xdst->u.rt);
-
- return 0;
-@@ -121,8 +120,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt.dst.rt_uncached_list)
-- rt_del_uncached_list(&xdst->u.rt);
-+ rt_del_uncached_list(&xdst->u.rt);
- xfrm_dst_destroy(xdst);
- }
-
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 9db0b2318e918..d4d06a9d985e8 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -334,7 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
- static void rt6_info_init(struct rt6_info *rt)
- {
- memset_after(rt, 0, dst);
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- /* allocate dst with ip6_dst_ops */
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index 2b493f8d00918..eecc5e59da17c 100644
---- a/net/ipv6/xfrm6_policy.c
-+++ b/net/ipv6/xfrm6_policy.c
-@@ -89,7 +89,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
- xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
- xdst->u.rt6.rt6i_src = rt->rt6i_src;
-- INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
- rt6_uncached_list_add(&xdst->u.rt6);
-
- return 0;
-@@ -121,8 +120,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt6.dst.rt_uncached_list)
-- rt6_uncached_list_del(&xdst->u.rt6);
-+ rt6_uncached_list_del(&xdst->u.rt6);
- xfrm_dst_destroy(xdst);
- }
-
---
-2.40.1
-
+++ /dev/null
-From e7f0083dd5326ec3a897b9d9c144fdaf4f630c4a Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:29 +0100
-Subject: net: dst: Prevent false sharing vs. dst_entry:: __refcnt
-
-From: Wangyang Guo <wangyang.guo@intel.com>
-
-[ Upstream commit d288a162dd1c73507da582966f17dd226e34a0c0 ]
-
-dst_entry::__refcnt is highly contended in scenarios where many connections
-happen from and to the same IP. The reference count is an atomic_t, so the
-reference count operations have to take the cache-line exclusive.
-
-Aside of the unavoidable reference count contention there is another
-significant problem which is caused by that: False sharing.
-
-perf top identified two affected read accesses. dst_entry::lwtstate and
-rtable::rt_genid.
-
-dst_entry::__refcnt is located at offset 64 of dst_entry, which puts it into
-a separate cache line vs. the read-mostly members located at the beginning
-of the struct.
-
-That prevents false sharing vs. the struct members in the first 64
-bytes of the structure, but there is also
-
- dst_entry::lwtstate
-
-which is located after the reference count and in the same cache line. This
-member is read after a reference count has been acquired.
-
-struct rtable embeds a struct dst_entry at offset 0. struct dst_entry has a
-size of 112 bytes, which means that the struct members of rtable which
-follow the dst member share the same cache line as dst_entry::__refcnt.
-Especially
-
- rtable::rt_genid
-
-is also read by the contexts which have a reference count acquired
-already.
-
-When dst_entry::__refcnt is incremented or decremented via an atomic
-operation, these read accesses stall. This was found when analysing the
-memtier benchmark in 1:100 mode, which amplifies the problem extremely.
-
-Move the rt[6i]_uncached[_list] members out of struct rtable and struct
-rt6_info into struct dst_entry to provide padding and move the lwtstate
-member after that so it ends up in the same cache line.
-
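-The layout concern can also be checked at build time. A hedged sketch on a
-made-up struct (the patch itself separates the members purely by placement
-and padding, not by explicit alignment attributes):
-
-  #include <stdalign.h>
-  #include <stdatomic.h>
-  #include <stddef.h>
-
-  struct example_entry {
-          /* read-mostly members, packed together */
-          void *dev;
-          void *ops;
-          unsigned int flags;
-          /* contended counter starts on its own cache line */
-          alignas(64) atomic_int refcnt;
-  };
-
-  _Static_assert(offsetof(struct example_entry, refcnt) % 64 == 0,
-                 "counter starts on a cache-line boundary");
-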
-The resulting improvement depends on the micro-architecture and the number
-of CPUs. It ranges from +20% to +120% with a localhost memtier/memcached
-benchmark.
-
-[ tglx: Rearrange struct ]
-
-Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
-Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-Reviewed-by: David Ahern <dsahern@kernel.org>
-Link: https://lore.kernel.org/r/20230323102800.042297517@linutronix.de
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/net/dst.h | 15 ++++++++++++++-
- include/net/ip6_fib.h | 3 ---
- include/net/ip6_route.h | 2 +-
- include/net/route.h | 3 ---
- net/ipv4/route.c | 20 ++++++++++----------
- net/ipv4/xfrm4_policy.c | 4 ++--
- net/ipv6/route.c | 26 +++++++++++++-------------
- net/ipv6/xfrm6_policy.c | 4 ++--
- 8 files changed, 42 insertions(+), 35 deletions(-)
-
-diff --git a/include/net/dst.h b/include/net/dst.h
-index d67fda89cd0fa..81f2279ea911a 100644
---- a/include/net/dst.h
-+++ b/include/net/dst.h
-@@ -69,15 +69,28 @@ struct dst_entry {
- #endif
- int __use;
- unsigned long lastuse;
-- struct lwtunnel_state *lwtstate;
- struct rcu_head rcu_head;
- short error;
- short __pad;
- __u32 tclassid;
- #ifndef CONFIG_64BIT
-+ struct lwtunnel_state *lwtstate;
- atomic_t __refcnt; /* 32-bit offset 64 */
- #endif
- netdevice_tracker dev_tracker;
-+
-+ /*
-+ * Used by rtable and rt6_info. Moves lwtstate into the next cache
-+ * line on 64bit so that lwtstate does not cause false sharing with
-+ * __refcnt under contention of __refcnt. This also puts the
-+ * frequently accessed members of rtable and rt6_info out of the
-+ * __refcnt cache line.
-+ */
-+ struct list_head rt_uncached;
-+ struct uncached_list *rt_uncached_list;
-+#ifdef CONFIG_64BIT
-+ struct lwtunnel_state *lwtstate;
-+#endif
- };
-
- struct dst_metrics {
-diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
-index fa4e6af382e2a..9ba6413fd2e3e 100644
---- a/include/net/ip6_fib.h
-+++ b/include/net/ip6_fib.h
-@@ -217,9 +217,6 @@ struct rt6_info {
- struct inet6_dev *rt6i_idev;
- u32 rt6i_flags;
-
-- struct list_head rt6i_uncached;
-- struct uncached_list *rt6i_uncached_list;
--
- /* more non-fragment space at head required */
- unsigned short rt6i_nfheader_len;
- };
-diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
-index 035d61d50a989..6c6b673d92554 100644
---- a/include/net/ip6_route.h
-+++ b/include/net/ip6_route.h
-@@ -104,7 +104,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
- static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
- {
- if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
-- !list_empty(&rt->rt6i_uncached))
-+ !list_empty(&rt->dst.rt_uncached))
- ip6_rt_put(rt);
- }
-
-diff --git a/include/net/route.h b/include/net/route.h
-index af8431b25f800..9ca0f72868b76 100644
---- a/include/net/route.h
-+++ b/include/net/route.h
-@@ -78,9 +78,6 @@ struct rtable {
- /* Miscellaneous cached information */
- u32 rt_mtu_locked:1,
- rt_pmtu:31;
--
-- struct list_head rt_uncached;
-- struct uncached_list *rt_uncached_list;
- };
-
- static inline bool rt_is_input_route(const struct rtable *rt)
-diff --git a/net/ipv4/route.c b/net/ipv4/route.c
-index 9cbaae4f5ee71..7ccf6503d67aa 100644
---- a/net/ipv4/route.c
-+++ b/net/ipv4/route.c
-@@ -1510,20 +1510,20 @@ void rt_add_uncached_list(struct rtable *rt)
- {
- struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
-
-- rt->rt_uncached_list = ul;
-+ rt->dst.rt_uncached_list = ul;
-
- spin_lock_bh(&ul->lock);
-- list_add_tail(&rt->rt_uncached, &ul->head);
-+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
- spin_unlock_bh(&ul->lock);
- }
-
- void rt_del_uncached_list(struct rtable *rt)
- {
-- if (!list_empty(&rt->rt_uncached)) {
-- struct uncached_list *ul = rt->rt_uncached_list;
-+ if (!list_empty(&rt->dst.rt_uncached)) {
-+ struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
-- list_del_init(&rt->rt_uncached);
-+ list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
- }
-@@ -1548,13 +1548,13 @@ void rt_flush_dev(struct net_device *dev)
- continue;
-
- spin_lock_bh(&ul->lock);
-- list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
-+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
- if (rt->dst.dev != dev)
- continue;
- rt->dst.dev = blackhole_netdev;
- netdev_ref_replace(dev, blackhole_netdev,
- &rt->dst.dev_tracker, GFP_ATOMIC);
-- list_move(&rt->rt_uncached, &ul->quarantine);
-+ list_move(&rt->dst.rt_uncached, &ul->quarantine);
- }
- spin_unlock_bh(&ul->lock);
- }
-@@ -1646,7 +1646,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
- rt->rt_uses_gateway = 0;
- rt->rt_gw_family = 0;
- rt->rt_gw4 = 0;
-- INIT_LIST_HEAD(&rt->rt_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
-
- rt->dst.output = ip_output;
- if (flags & RTCF_LOCAL)
-@@ -1677,7 +1677,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
- new_rt->rt_gw4 = rt->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- new_rt->rt_gw6 = rt->rt_gw6;
-- INIT_LIST_HEAD(&new_rt->rt_uncached);
-+ INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
-
- new_rt->dst.input = rt->dst.input;
- new_rt->dst.output = rt->dst.output;
-@@ -2862,7 +2862,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
- else if (rt->rt_gw_family == AF_INET6)
- rt->rt_gw6 = ort->rt_gw6;
-
-- INIT_LIST_HEAD(&rt->rt_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- dst_release(dst_orig);
-diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
-index 3d0dfa6cf9f96..47861c8b7340e 100644
---- a/net/ipv4/xfrm4_policy.c
-+++ b/net/ipv4/xfrm4_policy.c
-@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt.rt_gw6 = rt->rt_gw6;
- xdst->u.rt.rt_pmtu = rt->rt_pmtu;
- xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-- INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
-+ INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
- rt_add_uncached_list(&xdst->u.rt);
-
- return 0;
-@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt.rt_uncached_list)
-+ if (xdst->u.rt.dst.rt_uncached_list)
- rt_del_uncached_list(&xdst->u.rt);
- xfrm_dst_destroy(xdst);
- }
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 0bcdb675ba2c1..7205adee46c21 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt)
- {
- struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
-
-- rt->rt6i_uncached_list = ul;
-+ rt->dst.rt_uncached_list = ul;
-
- spin_lock_bh(&ul->lock);
-- list_add_tail(&rt->rt6i_uncached, &ul->head);
-+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
- spin_unlock_bh(&ul->lock);
- }
-
- void rt6_uncached_list_del(struct rt6_info *rt)
- {
-- if (!list_empty(&rt->rt6i_uncached)) {
-- struct uncached_list *ul = rt->rt6i_uncached_list;
-+ if (!list_empty(&rt->dst.rt_uncached)) {
-+ struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
-- list_del_init(&rt->rt6i_uncached);
-+ list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
- }
-@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
- continue;
-
- spin_lock_bh(&ul->lock);
-- list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
-+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
- struct inet6_dev *rt_idev = rt->rt6i_idev;
- struct net_device *rt_dev = rt->dst.dev;
- bool handled = false;
-@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
- handled = true;
- }
- if (handled)
-- list_move(&rt->rt6i_uncached,
-+ list_move(&rt->dst.rt_uncached,
- &ul->quarantine);
- }
- spin_unlock_bh(&ul->lock);
-@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
- static void rt6_info_init(struct rt6_info *rt)
- {
- memset_after(rt, 0, dst);
-- INIT_LIST_HEAD(&rt->rt6i_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- /* allocate dst with ip6_dst_ops */
-@@ -2641,7 +2641,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
- dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
- rt6 = (struct rt6_info *)dst;
- /* For dst cached in uncached_list, refcnt is already taken. */
-- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
-+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
- dst = &net->ipv6.ip6_null_entry->dst;
- dst_hold(dst);
- }
-@@ -2751,7 +2751,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
- from = rcu_dereference(rt->from);
-
- if (from && (rt->rt6i_flags & RTF_PCPU ||
-- unlikely(!list_empty(&rt->rt6i_uncached))))
-+ unlikely(!list_empty(&rt->dst.rt_uncached))))
- dst_ret = rt6_dst_from_check(rt, from, cookie);
- else
- dst_ret = rt6_check(rt, from, cookie);
-@@ -6488,7 +6488,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
-
- #ifdef CONFIG_IPV6_MULTIPLE_TABLES
- net->ipv6.fib6_has_custom_rules = false;
-@@ -6500,7 +6500,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
-
- net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
- sizeof(*net->ipv6.ip6_blk_hole_entry),
-@@ -6510,7 +6510,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
- #ifdef CONFIG_IPV6_SUBTREES
- net->ipv6.fib6_routes_require_src = 0;
- #endif
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index ea435eba30534..2b493f8d00918 100644
---- a/net/ipv6/xfrm6_policy.c
-+++ b/net/ipv6/xfrm6_policy.c
-@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
- xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
- xdst->u.rt6.rt6i_src = rt->rt6i_src;
-- INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
-+ INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
- rt6_uncached_list_add(&xdst->u.rt6);
-
- return 0;
-@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt6.rt6i_uncached_list)
-+ if (xdst->u.rt6.dst.rt_uncached_list)
- rt6_uncached_list_del(&xdst->u.rt6);
- xfrm_dst_destroy(xdst);
- }
---
-2.40.1
-
+++ /dev/null
-From 180ab46081f3404a77e4cef550c4f0b28701a1b3 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:32 +0100
-Subject: net: dst: Switch to rcuref_t reference counting
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit bc9d3a9f2afca189a6ae40225b6985e3c775375e ]
-
-Under high contention dst_entry::__refcnt becomes a significant bottleneck.
-
-atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
-high retry rates on contention.
-
-Switch the reference count to rcuref_t which results in a significant
-performance gain. Rename the reference count member to __rcuref to reflect
-the change.
-
-The gain depends on the micro-architecture and the number of concurrent
-operations and has been measured in the range of +25% to +130% with a
-localhost memtier/memcached benchmark which amplifies the problem
-massively.
-
-Running the memtier/memcached benchmark over a real (1Gb) network
-connection the conversion on top of the false sharing fix for struct
-dst_entry::__refcnt results in a total gain in the 2%-5% range over the
-upstream baseline.
-
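For context, a minimal userspace C11 sketch (not the kernel's implementation;
the function name is made up) of the "increment unless zero" retry loop that
atomic_inc_not_zero() boils down to. Every failed compare-exchange forces a
re-read and another attempt, which is the cost under contention that the
rcuref_t fast path avoids by using an unconditional atomic add in the common
case, with a slow path only near saturation:

	#include <stdatomic.h>
	#include <stdbool.h>

	static bool inc_not_zero(atomic_int *refcnt)
	{
		int old = atomic_load(refcnt);

		do {
			/* 0 means the object is already being torn down. */
			if (old == 0)
				return false;
		} while (!atomic_compare_exchange_weak(refcnt, &old, old + 1));

		return true;
	}
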
-Reported-by: Wangyang Guo <wangyang.guo@intel.com>
-Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
-Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/net/dst.h | 19 ++++++++++---------
- include/net/sock.h | 2 +-
- net/bridge/br_nf_core.c | 2 +-
- net/core/dst.c | 26 +++++---------------------
- net/core/rtnetlink.c | 2 +-
- net/ipv6/route.c | 6 +++---
- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
- 7 files changed, 23 insertions(+), 38 deletions(-)
-
-diff --git a/include/net/dst.h b/include/net/dst.h
-index 81f2279ea911a..78884429deed8 100644
---- a/include/net/dst.h
-+++ b/include/net/dst.h
-@@ -16,6 +16,7 @@
- #include <linux/bug.h>
- #include <linux/jiffies.h>
- #include <linux/refcount.h>
-+#include <linux/rcuref.h>
- #include <net/neighbour.h>
- #include <asm/processor.h>
- #include <linux/indirect_call_wrapper.h>
-@@ -61,11 +62,11 @@ struct dst_entry {
- unsigned short trailer_len; /* space to reserve at tail */
-
- /*
-- * __refcnt wants to be on a different cache line from
-+ * __rcuref wants to be on a different cache line from
- * input/output/ops or performance tanks badly
- */
- #ifdef CONFIG_64BIT
-- atomic_t __refcnt; /* 64-bit offset 64 */
-+ rcuref_t __rcuref; /* 64-bit offset 64 */
- #endif
- int __use;
- unsigned long lastuse;
-@@ -75,16 +76,16 @@ struct dst_entry {
- __u32 tclassid;
- #ifndef CONFIG_64BIT
- struct lwtunnel_state *lwtstate;
-- atomic_t __refcnt; /* 32-bit offset 64 */
-+ rcuref_t __rcuref; /* 32-bit offset 64 */
- #endif
- netdevice_tracker dev_tracker;
-
- /*
- * Used by rtable and rt6_info. Moves lwtstate into the next cache
- * line on 64bit so that lwtstate does not cause false sharing with
-- * __refcnt under contention of __refcnt. This also puts the
-+ * __rcuref under contention of __rcuref. This also puts the
- * frequently accessed members of rtable and rt6_info out of the
-- * __refcnt cache line.
-+ * __rcuref cache line.
- */
- struct list_head rt_uncached;
- struct uncached_list *rt_uncached_list;
-@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst)
- {
- /*
- * If your kernel compilation stops here, please check
-- * the placement of __refcnt in struct dst_entry
-+ * the placement of __rcuref in struct dst_entry
- */
-- BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
-- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
-+ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
-+ WARN_ON(!rcuref_get(&dst->__rcuref));
- }
-
- static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
-@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
- */
- static inline bool dst_hold_safe(struct dst_entry *dst)
- {
-- return atomic_inc_not_zero(&dst->__refcnt);
-+ return rcuref_get(&dst->__rcuref);
- }
-
- /**
-diff --git a/include/net/sock.h b/include/net/sock.h
-index fe695e8bfe289..4c988b981d6e1 100644
---- a/include/net/sock.h
-+++ b/include/net/sock.h
-@@ -2181,7 +2181,7 @@ sk_dst_get(struct sock *sk)
-
- rcu_read_lock();
- dst = rcu_dereference(sk->sk_dst_cache);
-- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-+ if (dst && !rcuref_get(&dst->__rcuref))
- dst = NULL;
- rcu_read_unlock();
- return dst;
-diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
-index 8c69f0c95a8ed..98aea5485aaef 100644
---- a/net/bridge/br_nf_core.c
-+++ b/net/bridge/br_nf_core.c
-@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
- {
- struct rtable *rt = &br->fake_rtable;
-
-- atomic_set(&rt->dst.__refcnt, 1);
-+ rcuref_init(&rt->dst.__rcuref, 1);
- rt->dst.dev = br->dev;
- dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
- rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
-diff --git a/net/core/dst.c b/net/core/dst.c
-index a4e738d321ba2..2b7b1619b5e29 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- dst->tclassid = 0;
- #endif
- dst->lwtstate = NULL;
-- atomic_set(&dst->__refcnt, initial_ref);
-+ rcuref_init(&dst->__rcuref, initial_ref);
- dst->__use = 0;
- dst->lastuse = jiffies;
- dst->flags = flags;
-@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put);
-
- void dst_release(struct dst_entry *dst)
- {
-- if (dst) {
-- int newrefcnt;
--
-- newrefcnt = atomic_dec_return(&dst->__refcnt);
-- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
-- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-- __func__, dst, newrefcnt);
-- if (!newrefcnt)
-- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
-- }
-+ if (dst && rcuref_put(&dst->__rcuref))
-+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
- }
- EXPORT_SYMBOL(dst_release);
-
- void dst_release_immediate(struct dst_entry *dst)
- {
-- if (dst) {
-- int newrefcnt;
--
-- newrefcnt = atomic_dec_return(&dst->__refcnt);
-- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
-- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-- __func__, dst, newrefcnt);
-- if (!newrefcnt)
-- dst_destroy(dst);
-- }
-+ if (dst && rcuref_put(&dst->__rcuref))
-+ dst_destroy(dst);
- }
- EXPORT_SYMBOL(dst_release_immediate);
-
-diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
-index 854b3fd66b1be..90810408cc5df 100644
---- a/net/core/rtnetlink.c
-+++ b/net/core/rtnetlink.c
-@@ -839,7 +839,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- if (dst) {
- ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
- ci.rta_used = dst->__use;
-- ci.rta_clntref = atomic_read(&dst->__refcnt);
-+ ci.rta_clntref = rcuref_read(&dst->__rcuref);
- }
- if (expires) {
- unsigned long clock;
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 7205adee46c21..9db0b2318e918 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = {
-
- static const struct rt6_info ip6_null_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -ENETUNREACH,
-@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = {
-
- static const struct rt6_info ip6_prohibit_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -EACCES,
-@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
-
- static const struct rt6_info ip6_blk_hole_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -EINVAL,
-diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
-index 7243079ef3546..70ef036909fb0 100644
---- a/net/netfilter/ipvs/ip_vs_xmit.c
-+++ b/net/netfilter/ipvs/ip_vs_xmit.c
-@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
- spin_unlock_bh(&dest->dst_lock);
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
- &dest->addr.ip, &dest_dst->dst_saddr.ip,
-- atomic_read(&rt->dst.__refcnt));
-+ rcuref_read(&rt->dst.__rcuref));
- }
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.ip;
-@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
- spin_unlock_bh(&dest->dst_lock);
- IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest_dst->dst_saddr.in6,
-- atomic_read(&rt->dst.__refcnt));
-+ rcuref_read(&rt->dst.__rcuref));
- }
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.in6;
---
-2.40.1
-
+++ /dev/null
-From 51290b74abe5ae7c0313a41f7e182e0d23a0ad56 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Fri, 18 Nov 2022 19:19:08 +0000
-Subject: net: Use call_rcu_hurry() for dst_release()
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit 483c26ff63f42e8898ed43aca0b9953bc91f0cd4 ]
-
-On ChromeOS, kernels built with the new CONFIG_RCU_LAZY=y Kconfig
-option fail a networking test in the teardown phase.
-
-This failure may be reproduced as follows: ip netns del <name>
-
-The CONFIG_RCU_LAZY=y Kconfig option was introduced by earlier commits
-in this series for the benefit of certain battery-powered systems.
-This Kconfig option causes call_rcu() to delay its callbacks in order
-to batch them. This means that a given RCU grace period covers more
-callbacks, thus reducing the number of grace periods, in turn reducing
-the amount of energy consumed, which increases battery lifetime, which
-can be a very good thing. This is not a subtle effect: In some important
-use cases, the battery lifetime is increased by more than 10%.
-
-This CONFIG_RCU_LAZY=y option is available only for CPUs that offload
-callbacks, for example, CPUs mentioned in the rcu_nocbs kernel boot
-parameter passed to kernels built with CONFIG_RCU_NOCB_CPU=y.
-
-Delaying callbacks is normally not a problem because most callbacks do
-nothing but free memory. If the system is short on memory, a shrinker
-will kick all currently queued lazy callbacks out of their laziness,
-thus freeing their memory in short order. Similarly, the rcu_barrier()
-function, which blocks until all currently queued callbacks are invoked,
-will also kick lazy callbacks, thus enabling rcu_barrier() to complete
-in a timely manner.
-
-However, there are some cases where laziness is not a good option.
-For example, synchronize_rcu() invokes call_rcu(), and blocks until
-the newly queued callback is invoked. It would not be good for
-synchronize_rcu() to block for ten seconds, even on an idle system.
-Therefore, synchronize_rcu() invokes call_rcu_hurry() instead of
-call_rcu(). The arrival of a non-lazy call_rcu_hurry() callback on a
-given CPU kicks any lazy callbacks that might be already queued on that
-CPU. After all, if there is going to be a grace period, all callbacks
-might as well get full benefit from it.
-
-Yes, this could be done the other way around by creating a
-call_rcu_lazy(), but earlier experience with this approach and
-feedback at the 2022 Linux Plumbers Conference shifted the approach
-to call_rcu() being lazy with call_rcu_hurry() for the few places
-where laziness is inappropriate.
-
-Returning to the test failure, use of ftrace showed that this failure
-was caused by the added delays due to this new lazy behavior of
-call_rcu() in kernels built with CONFIG_RCU_LAZY=y.
-
-Therefore, make dst_release() use call_rcu_hurry() in order to revert
-to the old test-failure-free behavior.
-
-[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]
-
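As a rule of thumb for the two APIs, a hedged sketch ("example_obj" and the
helper names are made up; only call_rcu() and call_rcu_hurry() are real):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_obj {
		struct rcu_head rcu;
		int data;
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct example_obj, rcu));
	}

	/* Ordinary deferred free: laziness is fine, keep the default call_rcu(). */
	static void example_put(struct example_obj *obj)
	{
		call_rcu(&obj->rcu, example_free_rcu);
	}

	/* Something will soon wait for this callback to run (much as netns
	 * teardown effectively waits on dst_destroy_rcu() in this patch),
	 * so do not let it linger behind lazy batching. */
	static void example_put_urgent(struct example_obj *obj)
	{
		call_rcu_hurry(&obj->rcu, example_free_rcu);
	}
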
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Cc: David Ahern <dsahern@kernel.org>
-Cc: "David S. Miller" <davem@davemloft.net>
-Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
-Cc: Jakub Kicinski <kuba@kernel.org>
-Cc: Paolo Abeni <pabeni@redhat.com>
-Cc: <netdev@vger.kernel.org>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- net/core/dst.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/net/core/dst.c b/net/core/dst.c
-index bc9c9be4e0801..a4e738d321ba2 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -174,7 +174,7 @@ void dst_release(struct dst_entry *dst)
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
-- call_rcu(&dst->rcu_head, dst_destroy_rcu);
-+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
- }
- }
- EXPORT_SYMBOL(dst_release);
---
-2.40.1
-
+++ /dev/null
-From 3f132e8e674299042d9e5313dfbfcb3de55af912 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sat, 17 Sep 2022 16:41:59 +0000
-Subject: rcu: Fix late wakeup when flush of bypass cblist happens
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit b50606f35f4b73c8e4c6b9c64fe7ba72ea919134 ]
-
-When the bypass cblist gets too big or its timeout has occurred, it is
-flushed into the main cblist. However, the bypass timer is still running
-and the behavior is that it would eventually expire and wake the GP
-thread.
-
-Since we are going to use the bypass cblist for lazy CBs, do the wakeup
-as soon as the flush of a "too big or too long" bypass list happens.
-Otherwise, long delays can happen for callbacks which get promoted from
-lazy to non-lazy.
-
-This is a good thing to do anyway (regardless of future lazy patches),
-since it makes the behavior consistent with that of other code paths
-where flushing into the ->cblist quickly brings the GP kthread out of
-its sleeping state.
-
-[ Frederic Weisbecker: Changes to avoid unnecessary GP-thread wakeups plus
- comment changes. ]
-
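The ordering matters: whether the regular list "was empty" has to be sampled
before the flush, and the wakeup issued right after it. A generic, hedged
sketch of that pattern (made-up names, not the RCU code itself):

	#include <linux/list.h>
	#include <linux/types.h>

	static void example_wake_consumer(void)
	{
		/* stand-in for waking the nocb GP kthread */
	}

	static void example_flush(struct list_head *src, struct list_head *dst)
	{
		/* Sample emptiness BEFORE the splice; afterwards dst is never
		 * empty, so a post-splice check could never trigger the wake. */
		bool was_empty = list_empty(dst);

		list_splice_tail_init(src, dst);

		if (was_empty)
			example_wake_consumer();
	}
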
-Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- kernel/rcu/tree_nocb.h | 10 ++++++++--
- 1 file changed, 8 insertions(+), 2 deletions(-)
-
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 0a5f0ef414845..04c87f250e01a 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -433,8 +433,9 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
-+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
-+
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
-- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-@@ -447,7 +448,12 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
-- rcu_nocb_unlock_irqrestore(rdp, flags);
-+
-+ // The flush succeeded and we moved CBs into the regular list.
-+ // Don't wait for the wake up timer as it may be too far ahead.
-+ // Wake up the GP thread now instead, if the cblist was empty.
-+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
-+
- return true; // Callback already enqueued.
- }
-
---
-2.40.1
-
+++ /dev/null
-From 6e201fbbe533ee08318f49c360c83145a1231ac2 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sun, 16 Oct 2022 16:22:53 +0000
-Subject: rcu: Fix missing nocb gp wake on rcu_barrier()
-
-From: Frederic Weisbecker <frederic@kernel.org>
-
-[ Upstream commit b8f7aca3f0e0e6223094ba2662bac90353674b04 ]
-
-In preparation for RCU lazy changes, wake up the RCU nocb gp thread if
-needed after an entrain. This change prevents the RCU barrier callback
-from waiting in the queue for several seconds before the lazy callbacks
-in front of it are serviced.
-
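To see why this latency matters, consider the common teardown pattern below
(a hedged sketch with made-up names); without the wakeup, the rcu_barrier()
call can sit behind lazy callbacks for several seconds:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static struct kmem_cache *example_cache;

	static void example_module_exit(void)
	{
		/* Objects from example_cache were queued for freeing via
		 * call_rcu() earlier; every one of those callbacks must have
		 * run before the cache itself may be destroyed. */
		rcu_barrier();
		kmem_cache_destroy(example_cache);
	}
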
-Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- kernel/rcu/tree.c | 11 +++++++++++
- kernel/rcu/tree.h | 1 +
- kernel/rcu/tree_nocb.h | 5 +++++
- 3 files changed, 17 insertions(+)
-
-diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
-index 917a1e43f7839..6ea59aa53db78 100644
---- a/kernel/rcu/tree.c
-+++ b/kernel/rcu/tree.c
-@@ -3908,6 +3908,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- {
- unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
- unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
-+ bool wake_nocb = false;
-+ bool was_alldone = false;
-
- lockdep_assert_held(&rcu_state.barrier_lock);
- if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
-@@ -3916,7 +3918,14 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- rdp->barrier_head.func = rcu_barrier_callback;
- debug_rcu_head_queue(&rdp->barrier_head);
- rcu_nocb_lock(rdp);
-+ /*
-+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
-+ * queue. This way we don't wait for bypass timer that can reach seconds
-+ * if it's fully lazy.
-+ */
-+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
- atomic_inc(&rcu_state.barrier_cpu_count);
- } else {
-@@ -3924,6 +3933,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
- }
- rcu_nocb_unlock(rdp);
-+ if (wake_nocb)
-+ wake_nocb_gp(rdp, false);
- smp_store_release(&rdp->barrier_seq_snap, gseq);
- }
-
-diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
-index d4a97e40ea9c3..925dd98f8b23b 100644
---- a/kernel/rcu/tree.h
-+++ b/kernel/rcu/tree.h
-@@ -439,6 +439,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
- static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
- static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
- static void rcu_init_one_nocb(struct rcu_node *rnp);
-+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 04c87f250e01a..74d4983d68f82 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -1570,6 +1570,11 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
- {
- }
-
-+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
-+{
-+ return false;
-+}
-+
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
- {
---
-2.40.1
-
+++ /dev/null
-From 7b253194c188b40a04df52ea0aeacae23989ef0d Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sun, 16 Oct 2022 16:22:54 +0000
-Subject: rcu: Make call_rcu() lazy to save power
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit 3cb278e73be58bfb780ecd55129296d2f74c1fb7 ]
-
-Implement timer-based RCU callback batching (also known as lazy
-callbacks). With this we save about 5-10% of power consumed due
-to RCU requests that happen when the system is lightly loaded or idle.
-
-By default, all async callbacks (queued via call_rcu) are marked
-lazy. An alternate API call_rcu_hurry() is provided for the few users,
-for example synchronize_rcu(), that need the old behavior.
-
-The batch is flushed whenever a certain amount of time has passed, or
-the batch on a particular CPU grows too big. A future patch will also
-flush it under memory pressure.
-
-To handle several corner cases automagically (such as rcu_barrier() and
-hotplug), we re-use the bypass lists, which were originally introduced to
-address lock contention, to handle lazy CBs as well. The bypass list
-length has the lazy CB length included in it. A separate lazy CB length
-counter is also introduced to keep track of the number of lazy CBs.
-
-[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ]
-[ paulmck: Apply Zqiang feedback. ]
-[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]
-
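A hedged sketch of how test code is expected to use the new flush-timeout
hooks (the two rcu_lazy_*_jiffies_till_flush() helpers are the ones added
below; the wrapper function and its placement in kernel/rcu/ test code are
assumptions):

	#include <linux/jiffies.h>
	#include "rcu.h"	/* rcu_lazy_{get,set}_jiffies_till_flush() */

	static void example_shorten_lazy_flush(void)
	{
		unsigned long orig = rcu_lazy_get_jiffies_till_flush();

		/* Flush lazy callbacks after at most 1 second instead of the
		 * default LAZY_FLUSH_JIFFIES (10 * HZ). */
		rcu_lazy_set_jiffies_till_flush(HZ);

		/* ... exercise call_rcu() latency here ... */

		rcu_lazy_set_jiffies_till_flush(orig);
	}
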
-Suggested-by: Paul McKenney <paulmck@kernel.org>
-Acked-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/rcupdate.h | 9 +++
- kernel/rcu/Kconfig | 8 ++
- kernel/rcu/rcu.h | 8 ++
- kernel/rcu/tiny.c | 2 +-
- kernel/rcu/tree.c | 129 ++++++++++++++++++++-----------
- kernel/rcu/tree.h | 11 ++-
- kernel/rcu/tree_exp.h | 2 +-
- kernel/rcu/tree_nocb.h | 159 +++++++++++++++++++++++++++++++--------
- 8 files changed, 246 insertions(+), 82 deletions(-)
-
-diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
-index e9e61cd27ef63..46f05dc8b31aa 100644
---- a/include/linux/rcupdate.h
-+++ b/include/linux/rcupdate.h
-@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)
-
- #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
-+#ifdef CONFIG_RCU_LAZY
-+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
-+#else
-+static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
-+{
-+ call_rcu(head, func);
-+}
-+#endif
-+
- /* Internal to kernel */
- void rcu_init(void);
- extern int rcu_scheduler_active;
-diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
-index d471d22a5e21b..d78f6181c8aad 100644
---- a/kernel/rcu/Kconfig
-+++ b/kernel/rcu/Kconfig
-@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
- Say N here if you hate read-side memory barriers.
- Take the default if you are unsure.
-
-+config RCU_LAZY
-+ bool "RCU callback lazy invocation functionality"
-+ depends on RCU_NOCB_CPU
-+ default n
-+ help
-+ To save power, batch RCU callbacks and flush after delay, memory
-+ pressure, or callback list growing too big.
-+
- endmenu # "RCU Subsystem"
-diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
-index 48d8f754b730e..6b86c5912beaf 100644
---- a/kernel/rcu/rcu.h
-+++ b/kernel/rcu/rcu.h
-@@ -474,6 +474,14 @@ enum rcutorture_type {
- INVALID_RCU_FLAVOR
- };
-
-+#if defined(CONFIG_RCU_LAZY)
-+unsigned long rcu_lazy_get_jiffies_till_flush(void);
-+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
-+#else
-+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
-+#endif
-+
- #if defined(CONFIG_TREE_RCU)
- void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq);
-diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
-index a33a8d4942c37..72913ce21258b 100644
---- a/kernel/rcu/tiny.c
-+++ b/kernel/rcu/tiny.c
-@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
-
- void rcu_barrier(void)
- {
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- }
- EXPORT_SYMBOL(rcu_barrier);
-
-diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
-index 6ea59aa53db78..855c035ec9630 100644
---- a/kernel/rcu/tree.c
-+++ b/kernel/rcu/tree.c
-@@ -2731,47 +2731,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
- raw_spin_unlock_rcu_node(rnp);
- }
-
--/**
-- * call_rcu() - Queue an RCU callback for invocation after a grace period.
-- * @head: structure to be used for queueing the RCU updates.
-- * @func: actual callback function to be invoked after the grace period
-- *
-- * The callback function will be invoked some time after a full grace
-- * period elapses, in other words after all pre-existing RCU read-side
-- * critical sections have completed. However, the callback function
-- * might well execute concurrently with RCU read-side critical sections
-- * that started after call_rcu() was invoked.
-- *
-- * RCU read-side critical sections are delimited by rcu_read_lock()
-- * and rcu_read_unlock(), and may be nested. In addition, but only in
-- * v5.0 and later, regions of code across which interrupts, preemption,
-- * or softirqs have been disabled also serve as RCU read-side critical
-- * sections. This includes hardware interrupt handlers, softirq handlers,
-- * and NMI handlers.
-- *
-- * Note that all CPUs must agree that the grace period extended beyond
-- * all pre-existing RCU read-side critical section. On systems with more
-- * than one CPU, this means that when "func()" is invoked, each CPU is
-- * guaranteed to have executed a full memory barrier since the end of its
-- * last RCU read-side critical section whose beginning preceded the call
-- * to call_rcu(). It also means that each CPU executing an RCU read-side
-- * critical section that continues beyond the start of "func()" must have
-- * executed a memory barrier after the call_rcu() but before the beginning
-- * of that RCU read-side critical section. Note that these guarantees
-- * include CPUs that are offline, idle, or executing in user mode, as
-- * well as CPUs that are executing in the kernel.
-- *
-- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
-- * resulting RCU callback function "func()", then both CPU A and CPU B are
-- * guaranteed to execute a full memory barrier during the time interval
-- * between the call to call_rcu() and the invocation of "func()" -- even
-- * if CPU A and CPU B are the same CPU (but again only if the system has
-- * more than one CPU).
-- *
-- * Implementation of these memory-ordering guarantees is described here:
-- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
-- */
--void call_rcu(struct rcu_head *head, rcu_callback_t func)
-+static void
-+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
- {
- static atomic_t doublefrees;
- unsigned long flags;
-@@ -2812,7 +2773,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
- }
-
- check_cb_ovld(rdp);
-- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
-+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
- return; // Enqueued onto ->nocb_bypass, so just leave.
- // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
- rcu_segcblist_enqueue(&rdp->cblist, head);
-@@ -2834,8 +2795,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
- local_irq_restore(flags);
- }
- }
--EXPORT_SYMBOL_GPL(call_rcu);
-
-+#ifdef CONFIG_RCU_LAZY
-+/**
-+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
-+ * flush all lazy callbacks (including the new one) to the main ->cblist while
-+ * doing so.
-+ *
-+ * @head: structure to be used for queueing the RCU updates.
-+ * @func: actual callback function to be invoked after the grace period
-+ *
-+ * The callback function will be invoked some time after a full grace
-+ * period elapses, in other words after all pre-existing RCU read-side
-+ * critical sections have completed.
-+ *
-+ * Use this API instead of call_rcu() if you don't want the callback to be
-+ * invoked after very long periods of time, which can happen on systems without
-+ * memory pressure and on systems which are lightly loaded or mostly idle.
-+ * This function will cause callbacks to be invoked sooner than later at the
-+ * expense of extra power. Other than that, this function is identical to, and
-+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
-+ * ordering and other functionality.
-+ */
-+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
-+{
-+ return __call_rcu_common(head, func, false);
-+}
-+EXPORT_SYMBOL_GPL(call_rcu_hurry);
-+#endif
-+
-+/**
-+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
-+ * By default the callbacks are 'lazy' and are kept hidden from the main
-+ * ->cblist to prevent starting of grace periods too soon.
-+ * If you desire grace periods to start very soon, use call_rcu_hurry().
-+ *
-+ * @head: structure to be used for queueing the RCU updates.
-+ * @func: actual callback function to be invoked after the grace period
-+ *
-+ * The callback function will be invoked some time after a full grace
-+ * period elapses, in other words after all pre-existing RCU read-side
-+ * critical sections have completed. However, the callback function
-+ * might well execute concurrently with RCU read-side critical sections
-+ * that started after call_rcu() was invoked.
-+ *
-+ * RCU read-side critical sections are delimited by rcu_read_lock()
-+ * and rcu_read_unlock(), and may be nested. In addition, but only in
-+ * v5.0 and later, regions of code across which interrupts, preemption,
-+ * or softirqs have been disabled also serve as RCU read-side critical
-+ * sections. This includes hardware interrupt handlers, softirq handlers,
-+ * and NMI handlers.
-+ *
-+ * Note that all CPUs must agree that the grace period extended beyond
-+ * all pre-existing RCU read-side critical section. On systems with more
-+ * than one CPU, this means that when "func()" is invoked, each CPU is
-+ * guaranteed to have executed a full memory barrier since the end of its
-+ * last RCU read-side critical section whose beginning preceded the call
-+ * to call_rcu(). It also means that each CPU executing an RCU read-side
-+ * critical section that continues beyond the start of "func()" must have
-+ * executed a memory barrier after the call_rcu() but before the beginning
-+ * of that RCU read-side critical section. Note that these guarantees
-+ * include CPUs that are offline, idle, or executing in user mode, as
-+ * well as CPUs that are executing in the kernel.
-+ *
-+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
-+ * resulting RCU callback function "func()", then both CPU A and CPU B are
-+ * guaranteed to execute a full memory barrier during the time interval
-+ * between the call to call_rcu() and the invocation of "func()" -- even
-+ * if CPU A and CPU B are the same CPU (but again only if the system has
-+ * more than one CPU).
-+ *
-+ * Implementation of these memory-ordering guarantees is described here:
-+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
-+ */
-+void call_rcu(struct rcu_head *head, rcu_callback_t func)
-+{
-+ return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
-+}
-+EXPORT_SYMBOL_GPL(call_rcu);
-
- /* Maximum number of jiffies to wait before draining a batch. */
- #define KFREE_DRAIN_JIFFIES (5 * HZ)
-@@ -3521,7 +3558,7 @@ void synchronize_rcu(void)
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- return;
- }
-
-@@ -3924,7 +3961,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- * if it's fully lazy.
- */
- was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
- wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
- atomic_inc(&rcu_state.barrier_cpu_count);
-@@ -4359,7 +4396,7 @@ void rcutree_migrate_callbacks(int cpu)
- my_rdp = this_cpu_ptr(&rcu_data);
- my_rnp = my_rdp->mynode;
- rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
- raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
- /* Leverage recent GPs and set GP for new callbacks. */
- needwake = rcu_advance_cbs(my_rnp, rdp) ||
-diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
-index 925dd98f8b23b..fcb5d696eb170 100644
---- a/kernel/rcu/tree.h
-+++ b/kernel/rcu/tree.h
-@@ -263,14 +263,16 @@ struct rcu_data {
- unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
- unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
-
-+ long lazy_len; /* Length of buffered lazy callbacks. */
- int cpu;
- };
-
- /* Values for nocb_defer_wakeup field in struct rcu_data. */
- #define RCU_NOCB_WAKE_NOT 0
- #define RCU_NOCB_WAKE_BYPASS 1
--#define RCU_NOCB_WAKE 2
--#define RCU_NOCB_WAKE_FORCE 3
-+#define RCU_NOCB_WAKE_LAZY 2
-+#define RCU_NOCB_WAKE 3
-+#define RCU_NOCB_WAKE_FORCE 4
-
- #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
- /* For jiffies_till_first_fqs and */
-@@ -441,9 +443,10 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
- static void rcu_init_one_nocb(struct rcu_node *rnp);
- static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j);
-+ unsigned long j, bool lazy);
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags);
-+ bool *was_alldone, unsigned long flags,
-+ bool lazy);
- static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags);
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
-diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
-index aa3ec3c3b9f75..b9637df7cda70 100644
---- a/kernel/rcu/tree_exp.h
-+++ b/kernel/rcu/tree_exp.h
-@@ -941,7 +941,7 @@ void synchronize_rcu_expedited(void)
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- return;
- }
-
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 74d4983d68f82..c3ec5f389d27f 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
- }
-
-+/*
-+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
-+ * can elapse before lazy callbacks are flushed. Lazy callbacks
-+ * could be flushed much earlier for a number of other reasons
-+ * could be flushed much earlier for a number of other reasons;
-+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
-+ * left unsubmitted to RCU after that many jiffies.
-+#define LAZY_FLUSH_JIFFIES (10 * HZ)
-+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
-+
-+#ifdef CONFIG_RCU_LAZY
-+// To be called only from test code.
-+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
-+{
-+ jiffies_till_flush = jif;
-+}
-+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
-+
-+unsigned long rcu_lazy_get_jiffies_till_flush(void)
-+{
-+ return jiffies_till_flush;
-+}
-+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
-+#endif
-+
- /*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
-@@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
-
- /*
-- * Bypass wakeup overrides previous deferments. In case
-- * of callback storm, no need to wake up too early.
-+ * Bypass wakeup overrides previous deferments. In case of
-+ * callback storms, no need to wake up too early.
- */
-- if (waketype == RCU_NOCB_WAKE_BYPASS) {
-+ if (waketype == RCU_NOCB_WAKE_LAZY &&
-+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
-+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
-+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
-+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- } else {
-@@ -293,10 +322,13 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
-+ * Return true if there was something to be flushed and it succeeded, otherwise
-+ * false.
-+ *
- * Note that this function always returns true if rhp is NULL.
- */
- static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- struct rcu_cblist rcl;
-
-@@ -310,7 +342,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
-- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
-+
-+ /*
-+ * If the new CB requested was a lazy one, queue it onto the main
-+ * ->cblist so we can take advantage of a sooner grace period.
-+ */
-+ if (lazy && rhp) {
-+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
-+ rcu_cblist_enqueue(&rcl, rhp);
-+ WRITE_ONCE(rdp->lazy_len, 0);
-+ } else {
-+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
-+ WRITE_ONCE(rdp->lazy_len, 0);
-+ }
-+
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
-@@ -326,13 +371,13 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- * Note that this function always returns true if rhp is NULL.
- */
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
-- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
- }
-
- /*
-@@ -345,7 +390,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
-- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
- }
-
- /*
-@@ -367,12 +412,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
- * there is only one CPU in operation.
- */
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags)
-+ bool *was_alldone, unsigned long flags,
-+ bool lazy)
- {
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
-
- lockdep_assert_irqs_disabled();
-
-@@ -417,25 +464,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
-- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
-+ // Lazy CBs throttle this back and do immediate bypass queuing.
-+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
-+
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
-- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
-+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
-+ (ncbs && bypass_is_lazy &&
-+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
-
-- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
-+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-@@ -463,13 +514,24 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
-+
-+ if (lazy)
-+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
-+
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
-- if (ncbs) {
-+ // A wake up of the grace period kthread or timer adjustment
-+ // needs to be done only if:
-+ // 1. Bypass list was fully empty before (this is the first
-+ // bypass list entry), or:
-+ // 2. Both of these conditions are met:
-+ // a. The bypass list previously had only lazy CBs, and:
-+ // b. The new CB is non-lazy.
-+ if (ncbs && (!bypass_is_lazy || lazy)) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
-@@ -497,8 +559,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
- {
-+ long bypass_len;
- unsigned long cur_gp_seq;
- unsigned long j;
-+ long lazy_len;
- long len;
- struct task_struct *t;
-
-@@ -512,9 +576,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
-+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ lazy_len = READ_ONCE(rdp->lazy_len);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
-- if (!irqs_disabled_flags(flags)) {
-+ // Only lazy CBs in bypass list
-+ if (lazy_len && bypass_len == lazy_len) {
-+ rcu_nocb_unlock_irqrestore(rdp, flags);
-+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
-+ TPS("WakeLazy"));
-+ } else if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
-@@ -605,12 +676,12 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
- static void nocb_gp_wait(struct rcu_data *my_rdp)
- {
- bool bypass = false;
-- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
-+ bool lazy = false;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
-@@ -640,24 +711,43 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
- * won't be ignored for long.
- */
- list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
-+ long bypass_ncbs;
-+ bool flush_bypass = false;
-+ long lazy_ncbs;
-+
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- lockdep_assert_held(&rdp->nocb_lock);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-- if (bypass_ncbs &&
-+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
-+
-+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
-+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
-+ bypass_ncbs > 2 * qhimark)) {
-+ flush_bypass = true;
-+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
-- // Bypass full or old, so flush it.
-- (void)rcu_nocb_try_flush_bypass(rdp, j);
-- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ flush_bypass = true;
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- continue; /* No callbacks here, try next. */
- }
-+
-+ if (flush_bypass) {
-+ // Bypass full or old, so flush it.
-+ (void)rcu_nocb_try_flush_bypass(rdp, j);
-+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
-+ }
-+
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
-- TPS("Bypass"));
-- bypass = true;
-+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
-+ if (bypass_ncbs == lazy_ncbs)
-+ lazy = true;
-+ else
-+ bypass = true;
- }
- rnp = rdp->mynode;
-
-@@ -705,12 +795,20 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
-
-- if (bypass && !rcu_nocb_poll) {
-- // At least one child with non-empty ->nocb_bypass, so set
-- // timer in order to avoid stranding its callbacks.
-- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-- TPS("WakeBypassIsDeferred"));
-+ // At least one child with non-empty ->nocb_bypass, so set
-+ // timer in order to avoid stranding its callbacks.
-+ if (!rcu_nocb_poll) {
-+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
-+ if (lazy && !bypass) {
-+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
-+ TPS("WakeLazyIsDeferred"));
-+ // Otherwise add a deferred bypass wake up.
-+ } else if (bypass) {
-+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-+ TPS("WakeBypassIsDeferred"));
-+ }
- }
-+
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
-@@ -1036,7 +1134,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
- /*
- * Start with invoking rcu_core() early. This way if the current thread
- * happens to preempt an ongoing call to rcu_core() in the middle,
-@@ -1290,6 +1388,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-+ WRITE_ONCE(rdp->lazy_len, 0);
- mutex_init(&rdp->nocb_gp_kthread_mutex);
- }
-
-@@ -1576,13 +1675,13 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
- }
-
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- return true;
- }
-
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags)
-+ bool *was_alldone, unsigned long flags, bool lazy)
- {
- return false;
- }
---
-2.40.1
-
fprobe-add-nr_maxactive-to-specify-rethook_node-pool.patch
fprobe-fix-to-ensure-the-number-of-active-retprobes-.patch
net-xfrm-skip-policies-marked-as-dead-while-reinsert.patch
-rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch
-rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch
-rcu-make-call_rcu-lazy-to-save-power.patch
-net-use-call_rcu_hurry-for-dst_release.patch
-atomics-provide-atomic_add_negative-variants.patch
-atomics-provide-rcuref-scalable-reference-counting.patch
-net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch
-net-dst-switch-to-rcuref_t-reference-counting.patch
-net-dst-fix-missing-initialization-of-rt_uncached.patch
xfrm6-fix-inet6_dev-refcount-underflow-problem.patch
net-mlx5-e-switch-register-event-handler-before-armi.patch
net-mlx5-handle-fw-tracer-change-ownership-event-bas.patch
phy-mapphone-mdm6600-fix-runtime-pm-for-remove.patch
phy-mapphone-mdm6600-fix-pinctrl_pm-handling-for-sle.patch
net-move-altnames-together-with-the-netdevice.patch
+bluetooth-hci_sock-fix-slab-oob-read-in-create_monitor_event.patch
+bluetooth-hci_sock-correctly-bounds-check-and-pad-hci_mon_new_index-name.patch
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
- net/ipv6/xfrm6_policy.c | 4 ++--
+ net/ipv6/xfrm6_policy.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index eecc5e59da17c..50c278f1c1063 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
-@@ -117,10 +117,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
+@@ -118,11 +118,11 @@ static void xfrm6_dst_destroy(struct dst
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
dst_destroy_metrics_generic(dst);
- rt6_uncached_list_del(&xdst->u.rt6);
+ if (xdst->u.rt6.rt6i_uncached_list)
+ rt6_uncached_list_del(&xdst->u.rt6);
+ if (likely(xdst->u.rt6.rt6i_idev))
+ in6_dev_put(xdst->u.rt6.rt6i_idev);
xfrm_dst_destroy(xdst);
}
---
-2.40.1
-