From: Greg Kroah-Hartman Date: Mon, 23 Oct 2023 08:40:57 +0000 (+0200) Subject: fix up a net 6.1 patch to not need the world of dependencies X-Git-Tag: v4.14.328~19 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=978b9497895ff7e4d1d8d237206a4b4cc58fae6f;p=thirdparty%2Fkernel%2Fstable-queue.git fix up a net 6.1 patch to not need the world of dependencies --- diff --git a/queue-6.1/atomics-provide-atomic_add_negative-variants.patch b/queue-6.1/atomics-provide-atomic_add_negative-variants.patch deleted file mode 100644 index b8689ca2f17..00000000000 --- a/queue-6.1/atomics-provide-atomic_add_negative-variants.patch +++ /dev/null @@ -1,481 +0,0 @@ -From 2194a9643e933a16a92f83d3859f3916f95a5e42 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Thu, 23 Mar 2023 21:55:30 +0100 -Subject: atomics: Provide atomic_add_negative() variants - -From: Thomas Gleixner - -[ Upstream commit e5ab9eff46b04c5a04778e40d7092fed3fda52ca ] - -atomic_add_negative() does not provide the relaxed/acquire/release -variants. - -Provide them in preparation for a new scalable reference count algorithm. - -Signed-off-by: Thomas Gleixner -Signed-off-by: Peter Zijlstra (Intel) -Acked-by: Mark Rutland -Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - include/linux/atomic/atomic-arch-fallback.h | 208 +++++++++++++++++++- - include/linux/atomic/atomic-instrumented.h | 68 ++++++- - include/linux/atomic/atomic-long.h | 38 +++- - scripts/atomic/atomics.tbl | 2 +- - scripts/atomic/fallbacks/add_negative | 11 +- - 5 files changed, 309 insertions(+), 18 deletions(-) - -diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h -index 77bc5522e61c6..4226379a232d5 100644 ---- a/include/linux/atomic/atomic-arch-fallback.h -+++ b/include/linux/atomic/atomic-arch-fallback.h -@@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v) - #define arch_atomic_inc_and_test arch_atomic_inc_and_test - #endif - -+#ifndef arch_atomic_add_negative_relaxed -+#ifdef arch_atomic_add_negative -+#define arch_atomic_add_negative_acquire arch_atomic_add_negative -+#define arch_atomic_add_negative_release arch_atomic_add_negative -+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative -+#endif /* arch_atomic_add_negative */ -+ - #ifndef arch_atomic_add_negative - /** -- * arch_atomic_add_negative - add and test if negative -+ * arch_atomic_add_negative - Add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * -- * Atomically adds @i to @v and returns true -- * if the result is negative, or false when -- * result is greater than or equal to zero. -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. - */ - static __always_inline bool - arch_atomic_add_negative(int i, atomic_t *v) -@@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v) - #define arch_atomic_add_negative arch_atomic_add_negative - #endif - -+#ifndef arch_atomic_add_negative_acquire -+/** -+ * arch_atomic_add_negative_acquire - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. 
-+ */ -+static __always_inline bool -+arch_atomic_add_negative_acquire(int i, atomic_t *v) -+{ -+ return arch_atomic_add_return_acquire(i, v) < 0; -+} -+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire -+#endif -+ -+#ifndef arch_atomic_add_negative_release -+/** -+ * arch_atomic_add_negative_release - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. -+ */ -+static __always_inline bool -+arch_atomic_add_negative_release(int i, atomic_t *v) -+{ -+ return arch_atomic_add_return_release(i, v) < 0; -+} -+#define arch_atomic_add_negative_release arch_atomic_add_negative_release -+#endif -+ -+#ifndef arch_atomic_add_negative_relaxed -+/** -+ * arch_atomic_add_negative_relaxed - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. -+ */ -+static __always_inline bool -+arch_atomic_add_negative_relaxed(int i, atomic_t *v) -+{ -+ return arch_atomic_add_return_relaxed(i, v) < 0; -+} -+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed -+#endif -+ -+#else /* arch_atomic_add_negative_relaxed */ -+ -+#ifndef arch_atomic_add_negative_acquire -+static __always_inline bool -+arch_atomic_add_negative_acquire(int i, atomic_t *v) -+{ -+ bool ret = arch_atomic_add_negative_relaxed(i, v); -+ __atomic_acquire_fence(); -+ return ret; -+} -+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire -+#endif -+ -+#ifndef arch_atomic_add_negative_release -+static __always_inline bool -+arch_atomic_add_negative_release(int i, atomic_t *v) -+{ -+ __atomic_release_fence(); -+ return arch_atomic_add_negative_relaxed(i, v); -+} -+#define arch_atomic_add_negative_release arch_atomic_add_negative_release -+#endif -+ -+#ifndef arch_atomic_add_negative -+static __always_inline bool -+arch_atomic_add_negative(int i, atomic_t *v) -+{ -+ bool ret; -+ __atomic_pre_full_fence(); -+ ret = arch_atomic_add_negative_relaxed(i, v); -+ __atomic_post_full_fence(); -+ return ret; -+} -+#define arch_atomic_add_negative arch_atomic_add_negative -+#endif -+ -+#endif /* arch_atomic_add_negative_relaxed */ -+ - #ifndef arch_atomic_fetch_add_unless - /** - * arch_atomic_fetch_add_unless - add unless the number is already a given value -@@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v) - #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test - #endif - -+#ifndef arch_atomic64_add_negative_relaxed -+#ifdef arch_atomic64_add_negative -+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative -+#define arch_atomic64_add_negative_release arch_atomic64_add_negative -+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative -+#endif /* arch_atomic64_add_negative */ -+ - #ifndef arch_atomic64_add_negative - /** -- * arch_atomic64_add_negative - add and test if negative -+ * arch_atomic64_add_negative - Add and test if negative - * @i: integer value to add - * @v: pointer of type atomic64_t - * -- * Atomically adds @i to @v and returns true -- * if the result is negative, or false when -- * result is greater than or equal to zero. -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. 
- */ - static __always_inline bool - arch_atomic64_add_negative(s64 i, atomic64_t *v) -@@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v) - #define arch_atomic64_add_negative arch_atomic64_add_negative - #endif - -+#ifndef arch_atomic64_add_negative_acquire -+/** -+ * arch_atomic64_add_negative_acquire - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic64_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. -+ */ -+static __always_inline bool -+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) -+{ -+ return arch_atomic64_add_return_acquire(i, v) < 0; -+} -+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire -+#endif -+ -+#ifndef arch_atomic64_add_negative_release -+/** -+ * arch_atomic64_add_negative_release - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic64_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. -+ */ -+static __always_inline bool -+arch_atomic64_add_negative_release(s64 i, atomic64_t *v) -+{ -+ return arch_atomic64_add_return_release(i, v) < 0; -+} -+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release -+#endif -+ -+#ifndef arch_atomic64_add_negative_relaxed -+/** -+ * arch_atomic64_add_negative_relaxed - Add and test if negative -+ * @i: integer value to add -+ * @v: pointer of type atomic64_t -+ * -+ * Atomically adds @i to @v and returns true if the result is negative, -+ * or false when the result is greater than or equal to zero. -+ */ -+static __always_inline bool -+arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) -+{ -+ return arch_atomic64_add_return_relaxed(i, v) < 0; -+} -+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed -+#endif -+ -+#else /* arch_atomic64_add_negative_relaxed */ -+ -+#ifndef arch_atomic64_add_negative_acquire -+static __always_inline bool -+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) -+{ -+ bool ret = arch_atomic64_add_negative_relaxed(i, v); -+ __atomic_acquire_fence(); -+ return ret; -+} -+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire -+#endif -+ -+#ifndef arch_atomic64_add_negative_release -+static __always_inline bool -+arch_atomic64_add_negative_release(s64 i, atomic64_t *v) -+{ -+ __atomic_release_fence(); -+ return arch_atomic64_add_negative_relaxed(i, v); -+} -+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release -+#endif -+ -+#ifndef arch_atomic64_add_negative -+static __always_inline bool -+arch_atomic64_add_negative(s64 i, atomic64_t *v) -+{ -+ bool ret; -+ __atomic_pre_full_fence(); -+ ret = arch_atomic64_add_negative_relaxed(i, v); -+ __atomic_post_full_fence(); -+ return ret; -+} -+#define arch_atomic64_add_negative arch_atomic64_add_negative -+#endif -+ -+#endif /* arch_atomic64_add_negative_relaxed */ -+ - #ifndef arch_atomic64_fetch_add_unless - /** - * arch_atomic64_fetch_add_unless - add unless the number is already a given value -@@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) - #endif - - #endif /* _LINUX_ATOMIC_FALLBACK_H */ --// b5e87bdd5ede61470c29f7a7e4de781af3770f09 -+// 00071fffa021cec66f6290d706d69c91df87bade -diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h -index 7a139ec030b0c..0496816738ca9 100644 ---- 
a/include/linux/atomic/atomic-instrumented.h -+++ b/include/linux/atomic/atomic-instrumented.h -@@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v) - return arch_atomic_add_negative(i, v); - } - -+static __always_inline bool -+atomic_add_negative_acquire(int i, atomic_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_add_negative_acquire(i, v); -+} -+ -+static __always_inline bool -+atomic_add_negative_release(int i, atomic_t *v) -+{ -+ kcsan_release(); -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_add_negative_release(i, v); -+} -+ -+static __always_inline bool -+atomic_add_negative_relaxed(int i, atomic_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_add_negative_relaxed(i, v); -+} -+ - static __always_inline int - atomic_fetch_add_unless(atomic_t *v, int a, int u) - { -@@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v) - return arch_atomic64_add_negative(i, v); - } - -+static __always_inline bool -+atomic64_add_negative_acquire(s64 i, atomic64_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic64_add_negative_acquire(i, v); -+} -+ -+static __always_inline bool -+atomic64_add_negative_release(s64 i, atomic64_t *v) -+{ -+ kcsan_release(); -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic64_add_negative_release(i, v); -+} -+ -+static __always_inline bool -+atomic64_add_negative_relaxed(s64 i, atomic64_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic64_add_negative_relaxed(i, v); -+} -+ - static __always_inline s64 - atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) - { -@@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v) - return arch_atomic_long_add_negative(i, v); - } - -+static __always_inline bool -+atomic_long_add_negative_acquire(long i, atomic_long_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_long_add_negative_acquire(i, v); -+} -+ -+static __always_inline bool -+atomic_long_add_negative_release(long i, atomic_long_t *v) -+{ -+ kcsan_release(); -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_long_add_negative_release(i, v); -+} -+ -+static __always_inline bool -+atomic_long_add_negative_relaxed(long i, atomic_long_t *v) -+{ -+ instrument_atomic_read_write(v, sizeof(*v)); -+ return arch_atomic_long_add_negative_relaxed(i, v); -+} -+ - static __always_inline long - atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) - { -@@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) - }) - - #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ --// 764f741eb77a7ad565dc8d99ce2837d5542e8aee -+// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a -diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h -index 800b8c35992d1..2fc51ba66bebd 100644 ---- a/include/linux/atomic/atomic-long.h -+++ b/include/linux/atomic/atomic-long.h -@@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) - return arch_atomic64_add_negative(i, v); - } - -+static __always_inline bool -+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) -+{ -+ return arch_atomic64_add_negative_acquire(i, v); -+} -+ -+static __always_inline bool -+arch_atomic_long_add_negative_release(long i, atomic_long_t *v) -+{ -+ return arch_atomic64_add_negative_release(i, v); -+} -+ -+static __always_inline bool -+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) -+{ -+ return 
arch_atomic64_add_negative_relaxed(i, v); -+} -+ - static __always_inline long - arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) - { -@@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) - return arch_atomic_add_negative(i, v); - } - -+static __always_inline bool -+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) -+{ -+ return arch_atomic_add_negative_acquire(i, v); -+} -+ -+static __always_inline bool -+arch_atomic_long_add_negative_release(long i, atomic_long_t *v) -+{ -+ return arch_atomic_add_negative_release(i, v); -+} -+ -+static __always_inline bool -+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) -+{ -+ return arch_atomic_add_negative_relaxed(i, v); -+} -+ - static __always_inline long - arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) - { -@@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v) - - #endif /* CONFIG_64BIT */ - #endif /* _LINUX_ATOMIC_LONG_H */ --// e8f0e08ff072b74d180eabe2ad001282b38c2c88 -+// a194c07d7d2f4b0e178d3c118c919775d5d65f50 -diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl -index fbee2f6190d9e..85ca8d9b5c279 100644 ---- a/scripts/atomic/atomics.tbl -+++ b/scripts/atomic/atomics.tbl -@@ -33,7 +33,7 @@ try_cmpxchg B v p:old i:new - sub_and_test b i v - dec_and_test b v - inc_and_test b v --add_negative b i v -+add_negative B i v - add_unless fb v i:a i:u - inc_not_zero b v - inc_unless_negative b v -diff --git a/scripts/atomic/fallbacks/add_negative b/scripts/atomic/fallbacks/add_negative -index 15caa2eb23712..e5980abf5904e 100755 ---- a/scripts/atomic/fallbacks/add_negative -+++ b/scripts/atomic/fallbacks/add_negative -@@ -1,16 +1,15 @@ - cat < -Date: Thu, 23 Mar 2023 21:55:31 +0100 -Subject: atomics: Provide rcuref - scalable reference counting - -From: Thomas Gleixner - -[ Upstream commit ee1ee6db07795d9637bc5e8993a8ddcf886541ef ] - -atomic_t based reference counting, including refcount_t, uses -atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is -implemented with a atomic_try_cmpxchg() loop. High contention of the -reference count leads to retry loops and scales badly. There is nothing to -improve on this implementation as the semantics have to be preserved. - -Provide rcuref as a scalable alternative solution which is suitable for RCU -managed objects. Similar to refcount_t it comes with overflow and underflow -detection and mitigation. - -rcuref treats the underlying atomic_t as an unsigned integer and partitions -this space into zones: - - 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references) - 0x80000000 - 0xBFFFFFFF saturation zone - 0xC0000000 - 0xFFFFFFFE dead zone - 0xFFFFFFFF no reference - -rcuref_get() unconditionally increments the reference count with -atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the -reference count with atomic_add_negative_release(). - -This unconditional increment avoids the inc_not_zero() problem, but -requires a more complex implementation on the put() side when the count -drops from 0 to -1. - -When this transition is detected then it is attempted to mark the reference -count dead, by setting it to the midpoint of the dead zone with a single -atomic_cmpxchg_release() operation. This operation can fail due to a -concurrent rcuref_get() elevating the reference count from -1 to 0 again. 
- -If the unconditional increment in rcuref_get() hits a reference count which -is marked dead (or saturated) it will detect it after the fact and bring -back the reference count to the midpoint of the respective zone. The zones -provide enough tolerance which makes it practically impossible to escape -from a zone. - -The racy implementation of rcuref_put() requires to protect rcuref_put() -against a grace period ending in order to prevent a subtle use after -free. As RCU is the only mechanism which allows to protect against that, it -is not possible to fully replace the atomic_inc_not_zero() based -implementation of refcount_t with this scheme. - -The final drop is slightly more expensive than the atomic_dec_return() -counterpart, but that's not the case which this is optimized for. The -optimization is on the high frequeunt get()/put() pairs and their -scalability. - -The performance of an uncontended rcuref_get()/put() pair where the put() -is not dropping the last reference is still on par with the plain atomic -operations, while at the same time providing overflow and underflow -detection and mitigation. - -The performance of rcuref compared to plain atomic_inc_not_zero() and -atomic_dec_return() based reference counting under contention: - - - Micro benchmark: All CPUs running a increment/decrement loop on an - elevated reference count, which means the 0 to -1 transition never - happens. - - The performance gain depends on microarchitecture and the number of - CPUs and has been observed in the range of 1.3X to 4.7X - - - Conversion of dst_entry::__refcnt to rcuref and testing with the - localhost memtier/memcached benchmark. That benchmark shows the - reference count contention prominently. - - The performance gain depends on microarchitecture and the number of - CPUs and has been observed in the range of 1.1X to 2.6X over the - previous fix for the false sharing issue vs. struct - dst_entry::__refcnt. - - When memtier is run over a real 1Gb network connection, there is a - small gain on top of the false sharing fix. The two changes combined - result in a 2%-5% total gain for that networked test. 
- -Reported-by: Wangyang Guo -Reported-by: Arjan Van De Ven -Signed-off-by: Thomas Gleixner -Signed-off-by: Peter Zijlstra (Intel) -Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - include/linux/rcuref.h | 155 +++++++++++++++++++++++ - include/linux/types.h | 6 + - lib/Makefile | 2 +- - lib/rcuref.c | 281 +++++++++++++++++++++++++++++++++++++++++ - 4 files changed, 443 insertions(+), 1 deletion(-) - create mode 100644 include/linux/rcuref.h - create mode 100644 lib/rcuref.c - -diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h -new file mode 100644 -index 0000000000000..2c8bfd0f1b6b3 ---- /dev/null -+++ b/include/linux/rcuref.h -@@ -0,0 +1,155 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+#ifndef _LINUX_RCUREF_H -+#define _LINUX_RCUREF_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define RCUREF_ONEREF 0x00000000U -+#define RCUREF_MAXREF 0x7FFFFFFFU -+#define RCUREF_SATURATED 0xA0000000U -+#define RCUREF_RELEASED 0xC0000000U -+#define RCUREF_DEAD 0xE0000000U -+#define RCUREF_NOREF 0xFFFFFFFFU -+ -+/** -+ * rcuref_init - Initialize a rcuref reference count with the given reference count -+ * @ref: Pointer to the reference count -+ * @cnt: The initial reference count typically '1' -+ */ -+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) -+{ -+ atomic_set(&ref->refcnt, cnt - 1); -+} -+ -+/** -+ * rcuref_read - Read the number of held reference counts of a rcuref -+ * @ref: Pointer to the reference count -+ * -+ * Return: The number of held references (0 ... N) -+ */ -+static inline unsigned int rcuref_read(rcuref_t *ref) -+{ -+ unsigned int c = atomic_read(&ref->refcnt); -+ -+ /* Return 0 if within the DEAD zone. */ -+ return c >= RCUREF_RELEASED ? 0 : c + 1; -+} -+ -+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); -+ -+/** -+ * rcuref_get - Acquire one reference on a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. -+ * -+ * Provides no memory ordering, it is assumed the caller has guaranteed the -+ * object memory to be stable (RCU, etc.). It does provide a control dependency -+ * and thereby orders future stores. See documentation in lib/rcuref.c -+ * -+ * Return: -+ * False if the attempt to acquire a reference failed. This happens -+ * when the last reference has been put already -+ * -+ * True if a reference was successfully acquired -+ */ -+static inline __must_check bool rcuref_get(rcuref_t *ref) -+{ -+ /* -+ * Unconditionally increase the reference count. The saturation and -+ * dead zones provide enough tolerance for this. -+ */ -+ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) -+ return true; -+ -+ /* Handle the cases inside the saturation and dead zones */ -+ return rcuref_get_slowpath(ref); -+} -+ -+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); -+ -+/* -+ * Internal helper. Do not invoke directly. -+ */ -+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) -+{ -+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), -+ "suspicious rcuref_put_rcusafe() usage"); -+ /* -+ * Unconditionally decrease the reference count. The saturation and -+ * dead zones provide enough tolerance for this. 
-+ */ -+ if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) -+ return false; -+ -+ /* -+ * Handle the last reference drop and cases inside the saturation -+ * and dead zones. -+ */ -+ return rcuref_put_slowpath(ref); -+} -+ -+/** -+ * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe -+ * @ref: Pointer to the reference count -+ * -+ * Provides release memory ordering, such that prior loads and stores are done -+ * before, and provides an acquire ordering on success such that free() -+ * must come after. -+ * -+ * Can be invoked from contexts, which guarantee that no grace period can -+ * happen which would free the object concurrently if the decrement drops -+ * the last reference and the slowpath races against a concurrent get() and -+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify. -+ * -+ * Return: -+ * True if this was the last reference with no future references -+ * possible. This signals the caller that it can safely release the -+ * object which is protected by the reference counter. -+ * -+ * False if there are still active references or the put() raced -+ * with a concurrent get()/put() pair. Caller is not allowed to -+ * release the protected object. -+ */ -+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) -+{ -+ return __rcuref_put(ref); -+} -+ -+/** -+ * rcuref_put -- Release one reference for a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Can be invoked from any context. -+ * -+ * Provides release memory ordering, such that prior loads and stores are done -+ * before, and provides an acquire ordering on success such that free() -+ * must come after. -+ * -+ * Return: -+ * -+ * True if this was the last reference with no future references -+ * possible. This signals the caller that it can safely schedule the -+ * object, which is protected by the reference counter, for -+ * deconstruction. -+ * -+ * False if there are still active references or the put() raced -+ * with a concurrent get()/put() pair. Caller is not allowed to -+ * deconstruct the protected object. 
-+ */ -+static inline __must_check bool rcuref_put(rcuref_t *ref) -+{ -+ bool released; -+ -+ preempt_disable(); -+ released = __rcuref_put(ref); -+ preempt_enable(); -+ return released; -+} -+ -+#endif -diff --git a/include/linux/types.h b/include/linux/types.h -index ea8cf60a8a795..688fb943556a1 100644 ---- a/include/linux/types.h -+++ b/include/linux/types.h -@@ -175,6 +175,12 @@ typedef struct { - } atomic64_t; - #endif - -+typedef struct { -+ atomic_t refcnt; -+} rcuref_t; -+ -+#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) } -+ - struct list_head { - struct list_head *next, *prev; - }; -diff --git a/lib/Makefile b/lib/Makefile -index 5ffe72ec99797..afd78c497ec76 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ - list_sort.o uuid.o iov_iter.o clz_ctz.o \ - bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o base64.o \ -- once.o refcount.o usercopy.o errseq.o bucket_locks.o \ -+ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ - generic-radix-tree.o - obj-$(CONFIG_STRING_SELFTEST) += test_string.o - obj-y += string_helpers.o -diff --git a/lib/rcuref.c b/lib/rcuref.c -new file mode 100644 -index 0000000000000..5ec00a4a64d11 ---- /dev/null -+++ b/lib/rcuref.c -@@ -0,0 +1,281 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+ -+/* -+ * rcuref - A scalable reference count implementation for RCU managed objects -+ * -+ * rcuref is provided to replace open coded reference count implementations -+ * based on atomic_t. It protects explicitely RCU managed objects which can -+ * be visible even after the last reference has been dropped and the object -+ * is heading towards destruction. -+ * -+ * A common usage pattern is: -+ * -+ * get() -+ * rcu_read_lock(); -+ * p = get_ptr(); -+ * if (p && !atomic_inc_not_zero(&p->refcnt)) -+ * p = NULL; -+ * rcu_read_unlock(); -+ * return p; -+ * -+ * put() -+ * if (!atomic_dec_return(&->refcnt)) { -+ * remove_ptr(p); -+ * kfree_rcu((p, rcu); -+ * } -+ * -+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has -+ * O(N^2) behaviour under contention with N concurrent operations. -+ * -+ * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales -+ * better under contention. -+ * -+ * Why not refcount? -+ * ================= -+ * -+ * In principle it should be possible to make refcount use the rcuref -+ * scheme, but the destruction race described below cannot be prevented -+ * unless the protected object is RCU managed. -+ * -+ * Theory of operation -+ * =================== -+ * -+ * rcuref uses an unsigned integer reference counter. As long as the -+ * counter value is greater than or equal to RCUREF_ONEREF and not larger -+ * than RCUREF_MAXREF the reference is alive: -+ * -+ * ONEREF MAXREF SATURATED RELEASED DEAD NOREF -+ * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF -+ * <---valid --------> <-------saturation zone-------> <-----dead zone-----> -+ * -+ * The get() and put() operations do unconditional increments and -+ * decrements. The result is checked after the operation. This optimizes -+ * for the fast path. -+ * -+ * If the reference count is saturated or dead, then the increments and -+ * decrements are not harmful as the reference count still stays in the -+ * respective zones and is always set back to STATURATED resp. DEAD. 
The -+ * zones have room for 2^28 racing operations in each direction, which -+ * makes it practically impossible to escape the zones. -+ * -+ * Once the last reference is dropped the reference count becomes -+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The -+ * slowpath then tries to set the reference count from RCUREF_NOREF to -+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a -+ * concurrent rcuref_get() can acquire the reference count and bring it -+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. -+ * -+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in -+ * DEAD + 1, which is inside the dead zone. If that happens the reference -+ * count is put back to DEAD. -+ * -+ * The actual race is possible due to the unconditional increment and -+ * decrements in rcuref_get() and rcuref_put(): -+ * -+ * T1 T2 -+ * get() put() -+ * if (atomic_add_negative(-1, &ref->refcnt)) -+ * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); -+ * -+ * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 -+ * -+ * As the result of T1's add is negative, the get() goes into the slow path -+ * and observes refcnt being in the dead zone which makes the operation fail. -+ * -+ * Possible critical states: -+ * -+ * Context Counter References Operation -+ * T1 0 1 init() -+ * T2 1 2 get() -+ * T1 0 1 put() -+ * T2 -1 0 put() tries to mark dead -+ * T1 0 1 get() -+ * T2 0 1 put() mark dead fails -+ * T1 -1 0 put() tries to mark dead -+ * T1 DEAD 0 put() mark dead succeeds -+ * T2 DEAD+1 0 get() fails and puts it back to DEAD -+ * -+ * Of course there are more complex scenarios, but the above illustrates -+ * the working principle. The rest is left to the imagination of the -+ * reader. -+ * -+ * Deconstruction race -+ * =================== -+ * -+ * The release operation must be protected by prohibiting a grace period in -+ * order to prevent a possible use after free: -+ * -+ * T1 T2 -+ * put() get() -+ * // ref->refcnt = ONEREF -+ * if (!atomic_add_negative(-1, &ref->refcnt)) -+ * return false; <- Not taken -+ * -+ * // ref->refcnt == NOREF -+ * --> preemption -+ * // Elevates ref->refcnt to ONEREF -+ * if (!atomic_add_negative(1, &ref->refcnt)) -+ * return true; <- taken -+ * -+ * if (put(&p->ref)) { <-- Succeeds -+ * remove_pointer(p); -+ * kfree_rcu(p, rcu); -+ * } -+ * -+ * RCU grace period ends, object is freed -+ * -+ * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF -+ * -+ * This is prevented by disabling preemption around the put() operation as -+ * that's in most kernel configurations cheaper than a rcu_read_lock() / -+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it -+ * prevents the grace period which keeps the object alive until all put() -+ * operations complete. -+ * -+ * Saturation protection -+ * ===================== -+ * -+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). -+ * Once this is exceedded the reference count becomes stale by setting it -+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents -+ * wrap arounds which obviously cause worse problems than a memory -+ * leak. When saturation is reached a warning is emitted. -+ * -+ * Race conditions -+ * =============== -+ * -+ * All reference count increment/decrement operations are unconditional and -+ * only verified after the fact. This optimizes for the good case and takes -+ * the occasional race vs. a dead or already saturated refcount into -+ * account. 
The saturation and dead zones are large enough to accomodate -+ * for that. -+ * -+ * Memory ordering -+ * =============== -+ * -+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions -+ * and provide only what is strictly required for refcounts. -+ * -+ * The increments are fully relaxed; these will not provide ordering. The -+ * rationale is that whatever is used to obtain the object to increase the -+ * reference count on will provide the ordering. For locked data -+ * structures, its the lock acquire, for RCU/lockless data structures its -+ * the dependent load. -+ * -+ * rcuref_get() provides a control dependency ordering future stores which -+ * ensures that the object is not modified when acquiring a reference -+ * fails. -+ * -+ * rcuref_put() provides release order, i.e. all prior loads and stores -+ * will be issued before. It also provides a control dependency ordering -+ * against the subsequent destruction of the object. -+ * -+ * If rcuref_put() successfully dropped the last reference and marked the -+ * object DEAD it also provides acquire ordering. -+ */ -+ -+#include -+#include -+ -+/** -+ * rcuref_get_slowpath - Slowpath of rcuref_get() -+ * @ref: Pointer to the reference count -+ * -+ * Invoked when the reference count is outside of the valid zone. -+ * -+ * Return: -+ * False if the reference count was already marked dead -+ * -+ * True if the reference count is saturated, which prevents the -+ * object from being deconstructed ever. -+ */ -+bool rcuref_get_slowpath(rcuref_t *ref) -+{ -+ unsigned int cnt = atomic_read(&ref->refcnt); -+ -+ /* -+ * If the reference count was already marked dead, undo the -+ * increment so it stays in the middle of the dead zone and return -+ * fail. -+ */ -+ if (cnt >= RCUREF_RELEASED) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * If it was saturated, warn and mark it so. In case the increment -+ * was already on a saturated value restore the saturation -+ * marker. This keeps it in the middle of the saturation zone and -+ * prevents the reference count from overflowing. This leaks the -+ * object memory, but prevents the obvious reference count overflow -+ * damage. -+ */ -+ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return true; -+} -+EXPORT_SYMBOL_GPL(rcuref_get_slowpath); -+ -+/** -+ * rcuref_put_slowpath - Slowpath of __rcuref_put() -+ * @ref: Pointer to the reference count -+ * -+ * Invoked when the reference count is outside of the valid zone. -+ * -+ * Return: -+ * True if this was the last reference with no future references -+ * possible. This signals the caller that it can safely schedule the -+ * object, which is protected by the reference counter, for -+ * deconstruction. -+ * -+ * False if there are still active references or the put() raced -+ * with a concurrent get()/put() pair. Caller is not allowed to -+ * deconstruct the protected object. -+ */ -+bool rcuref_put_slowpath(rcuref_t *ref) -+{ -+ unsigned int cnt = atomic_read(&ref->refcnt); -+ -+ /* Did this drop the last reference? */ -+ if (likely(cnt == RCUREF_NOREF)) { -+ /* -+ * Carefully try to set the reference count to RCUREF_DEAD. -+ * -+ * This can fail if a concurrent get() operation has -+ * elevated it again or the corresponding put() even marked -+ * it dead already. Both are valid situations and do not -+ * require a retry. If this fails the caller is not -+ * allowed to deconstruct the object. 
-+ */ -+ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) -+ return false; -+ -+ /* -+ * The caller can safely schedule the object for -+ * deconstruction. Provide acquire ordering. -+ */ -+ smp_acquire__after_ctrl_dep(); -+ return true; -+ } -+ -+ /* -+ * If the reference count was already in the dead zone, then this -+ * put() operation is imbalanced. Warn, put the reference count back to -+ * DEAD and tell the caller to not deconstruct the object. -+ */ -+ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * This is a put() operation on a saturated refcount. Restore the -+ * mean saturation value and tell the caller to not deconstruct the -+ * object. -+ */ -+ if (cnt > RCUREF_MAXREF) -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return false; -+} -+EXPORT_SYMBOL_GPL(rcuref_put_slowpath); --- -2.40.1 - diff --git a/queue-6.1/net-dst-fix-missing-initialization-of-rt_uncached.patch b/queue-6.1/net-dst-fix-missing-initialization-of-rt_uncached.patch deleted file mode 100644 index 3244b264911..00000000000 --- a/queue-6.1/net-dst-fix-missing-initialization-of-rt_uncached.patch +++ /dev/null @@ -1,177 +0,0 @@ -From f4fdfd10202488104e6e484bd76fd1b5cd7c10c6 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Thu, 20 Apr 2023 20:25:08 +0200 -Subject: net: dst: fix missing initialization of rt_uncached - -From: Maxime Bizon - -[ Upstream commit 418a73074da9182f571e467eaded03ea501f3281 ] - -xfrm_alloc_dst() followed by xfrm4_dst_destroy(), without a -xfrm4_fill_dst() call in between, causes the following BUG: - - BUG: spinlock bad magic on CPU#0, fbxhostapd/732 - lock: 0x890b7668, .magic: 890b7668, .owner: /-1, .owner_cpu: 0 - CPU: 0 PID: 732 Comm: fbxhostapd Not tainted 6.3.0-rc6-next-20230414-00613-ge8de66369925-dirty #9 - Hardware name: Marvell Kirkwood (Flattened Device Tree) - unwind_backtrace from show_stack+0x10/0x14 - show_stack from dump_stack_lvl+0x28/0x30 - dump_stack_lvl from do_raw_spin_lock+0x20/0x80 - do_raw_spin_lock from rt_del_uncached_list+0x30/0x64 - rt_del_uncached_list from xfrm4_dst_destroy+0x3c/0xbc - xfrm4_dst_destroy from dst_destroy+0x5c/0xb0 - dst_destroy from rcu_process_callbacks+0xc4/0xec - rcu_process_callbacks from __do_softirq+0xb4/0x22c - __do_softirq from call_with_stack+0x1c/0x24 - call_with_stack from do_softirq+0x60/0x6c - do_softirq from __local_bh_enable_ip+0xa0/0xcc - -Patch "net: dst: Prevent false sharing vs. dst_entry:: __refcnt" moved -rt_uncached and rt_uncached_list fields from rtable struct to dst -struct, so they are more zeroed by memset_after(xdst, 0, u.dst) in -xfrm_alloc_dst(). - -Note that rt_uncached (list_head) was never properly initialized at -alloc time, but xfrm[46]_dst_destroy() is written in such a way that -it was not an issue thanks to the memset: - - if (xdst->u.rt.dst.rt_uncached_list) - rt_del_uncached_list(&xdst->u.rt); - -The route code does it the other way around: rt_uncached_list is -assumed to be valid IIF rt_uncached list_head is not empty: - -void rt_del_uncached_list(struct rtable *rt) -{ - if (!list_empty(&rt->dst.rt_uncached)) { - struct uncached_list *ul = rt->dst.rt_uncached_list; - - spin_lock_bh(&ul->lock); - list_del_init(&rt->dst.rt_uncached); - spin_unlock_bh(&ul->lock); - } -} - -This patch adds mandatory rt_uncached list_head initialization in -generic dst_init(), and adapt xfrm[46]_dst_destroy logic to match the -rest of the code. 
- -Fixes: d288a162dd1c ("net: dst: Prevent false sharing vs. dst_entry:: __refcnt") -Reported-by: kernel test robot -Link: https://lore.kernel.org/oe-lkp/202304162125.18b7bcdd-oliver.sang@intel.com -Reviewed-by: David Ahern -Reviewed-by: Eric Dumazet -CC: Leon Romanovsky -Signed-off-by: Maxime Bizon -Link: https://lore.kernel.org/r/20230420182508.2417582-1-mbizon@freebox.fr -Signed-off-by: Jakub Kicinski -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - net/core/dst.c | 1 + - net/ipv4/route.c | 4 ---- - net/ipv4/xfrm4_policy.c | 4 +--- - net/ipv6/route.c | 1 - - net/ipv6/xfrm6_policy.c | 4 +--- - 5 files changed, 3 insertions(+), 11 deletions(-) - -diff --git a/net/core/dst.c b/net/core/dst.c -index 2b7b1619b5e29..1666a6f5e858e 100644 ---- a/net/core/dst.c -+++ b/net/core/dst.c -@@ -67,6 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, - #endif - dst->lwtstate = NULL; - rcuref_init(&dst->__rcuref, initial_ref); -+ INIT_LIST_HEAD(&dst->rt_uncached); - dst->__use = 0; - dst->lastuse = jiffies; - dst->flags = flags; -diff --git a/net/ipv4/route.c b/net/ipv4/route.c -index 7ccf6503d67aa..a44d20644fbc2 100644 ---- a/net/ipv4/route.c -+++ b/net/ipv4/route.c -@@ -1646,7 +1646,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev, - rt->rt_uses_gateway = 0; - rt->rt_gw_family = 0; - rt->rt_gw4 = 0; -- INIT_LIST_HEAD(&rt->dst.rt_uncached); - - rt->dst.output = ip_output; - if (flags & RTCF_LOCAL) -@@ -1677,7 +1676,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) - new_rt->rt_gw4 = rt->rt_gw4; - else if (rt->rt_gw_family == AF_INET6) - new_rt->rt_gw6 = rt->rt_gw6; -- INIT_LIST_HEAD(&new_rt->dst.rt_uncached); - - new_rt->dst.input = rt->dst.input; - new_rt->dst.output = rt->dst.output; -@@ -2861,8 +2859,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or - rt->rt_gw4 = ort->rt_gw4; - else if (rt->rt_gw_family == AF_INET6) - rt->rt_gw6 = ort->rt_gw6; -- -- INIT_LIST_HEAD(&rt->dst.rt_uncached); - } - - dst_release(dst_orig); -diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c -index 47861c8b7340e..9403bbaf1b616 100644 ---- a/net/ipv4/xfrm4_policy.c -+++ b/net/ipv4/xfrm4_policy.c -@@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - xdst->u.rt.rt_gw6 = rt->rt_gw6; - xdst->u.rt.rt_pmtu = rt->rt_pmtu; - xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; -- INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached); - rt_add_uncached_list(&xdst->u.rt); - - return 0; -@@ -121,8 +120,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - - dst_destroy_metrics_generic(dst); -- if (xdst->u.rt.dst.rt_uncached_list) -- rt_del_uncached_list(&xdst->u.rt); -+ rt_del_uncached_list(&xdst->u.rt); - xfrm_dst_destroy(xdst); - } - -diff --git a/net/ipv6/route.c b/net/ipv6/route.c -index 9db0b2318e918..d4d06a9d985e8 100644 ---- a/net/ipv6/route.c -+++ b/net/ipv6/route.c -@@ -334,7 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = { - static void rt6_info_init(struct rt6_info *rt) - { - memset_after(rt, 0, dst); -- INIT_LIST_HEAD(&rt->dst.rt_uncached); - } - - /* allocate dst with ip6_dst_ops */ -diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c -index 2b493f8d00918..eecc5e59da17c 100644 ---- a/net/ipv6/xfrm6_policy.c -+++ b/net/ipv6/xfrm6_policy.c -@@ -89,7 +89,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - 
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; - xdst->u.rt6.rt6i_dst = rt->rt6i_dst; - xdst->u.rt6.rt6i_src = rt->rt6i_src; -- INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached); - rt6_uncached_list_add(&xdst->u.rt6); - - return 0; -@@ -121,8 +120,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) - if (likely(xdst->u.rt6.rt6i_idev)) - in6_dev_put(xdst->u.rt6.rt6i_idev); - dst_destroy_metrics_generic(dst); -- if (xdst->u.rt6.dst.rt_uncached_list) -- rt6_uncached_list_del(&xdst->u.rt6); -+ rt6_uncached_list_del(&xdst->u.rt6); - xfrm_dst_destroy(xdst); - } - --- -2.40.1 - diff --git a/queue-6.1/net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch b/queue-6.1/net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch deleted file mode 100644 index ae7f971dd8c..00000000000 --- a/queue-6.1/net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch +++ /dev/null @@ -1,372 +0,0 @@ -From e7f0083dd5326ec3a897b9d9c144fdaf4f630c4a Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Thu, 23 Mar 2023 21:55:29 +0100 -Subject: net: dst: Prevent false sharing vs. dst_entry:: __refcnt - -From: Wangyang Guo - -[ Upstream commit d288a162dd1c73507da582966f17dd226e34a0c0 ] - -dst_entry::__refcnt is highly contended in scenarios where many connections -happen from and to the same IP. The reference count is an atomic_t, so the -reference count operations have to take the cache-line exclusive. - -Aside of the unavoidable reference count contention there is another -significant problem which is caused by that: False sharing. - -perf top identified two affected read accesses. dst_entry::lwtstate and -rtable::rt_genid. - -dst_entry:__refcnt is located at offset 64 of dst_entry, which puts it into -a seperate cacheline vs. the read mostly members located at the beginning -of the struct. - -That prevents false sharing vs. the struct members in the first 64 -bytes of the structure, but there is also - - dst_entry::lwtstate - -which is located after the reference count and in the same cache line. This -member is read after a reference count has been acquired. - -struct rtable embeds a struct dst_entry at offset 0. struct dst_entry has a -size of 112 bytes, which means that the struct members of rtable which -follow the dst member share the same cache line as dst_entry::__refcnt. -Especially - - rtable::rt_genid - -is also read by the contexts which have a reference count acquired -already. - -When dst_entry:__refcnt is incremented or decremented via an atomic -operation these read accesses stall. This was found when analysing the -memtier benchmark in 1:100 mode, which amplifies the problem extremly. - -Move the rt[6i]_uncached[_list] members out of struct rtable and struct -rt6_info into struct dst_entry to provide padding and move the lwtstate -member after that so it ends up in the same cache line. - -The resulting improvement depends on the micro-architecture and the number -of CPUs. It ranges from +20% to +120% with a localhost memtier/memcached -benchmark. 
- -[ tglx: Rearrange struct ] - -Signed-off-by: Wangyang Guo -Signed-off-by: Arjan van de Ven -Signed-off-by: Thomas Gleixner -Reviewed-by: Eric Dumazet -Reviewed-by: David Ahern -Link: https://lore.kernel.org/r/20230323102800.042297517@linutronix.de -Signed-off-by: Jakub Kicinski -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - include/net/dst.h | 15 ++++++++++++++- - include/net/ip6_fib.h | 3 --- - include/net/ip6_route.h | 2 +- - include/net/route.h | 3 --- - net/ipv4/route.c | 20 ++++++++++---------- - net/ipv4/xfrm4_policy.c | 4 ++-- - net/ipv6/route.c | 26 +++++++++++++------------- - net/ipv6/xfrm6_policy.c | 4 ++-- - 8 files changed, 42 insertions(+), 35 deletions(-) - -diff --git a/include/net/dst.h b/include/net/dst.h -index d67fda89cd0fa..81f2279ea911a 100644 ---- a/include/net/dst.h -+++ b/include/net/dst.h -@@ -69,15 +69,28 @@ struct dst_entry { - #endif - int __use; - unsigned long lastuse; -- struct lwtunnel_state *lwtstate; - struct rcu_head rcu_head; - short error; - short __pad; - __u32 tclassid; - #ifndef CONFIG_64BIT -+ struct lwtunnel_state *lwtstate; - atomic_t __refcnt; /* 32-bit offset 64 */ - #endif - netdevice_tracker dev_tracker; -+ -+ /* -+ * Used by rtable and rt6_info. Moves lwtstate into the next cache -+ * line on 64bit so that lwtstate does not cause false sharing with -+ * __refcnt under contention of __refcnt. This also puts the -+ * frequently accessed members of rtable and rt6_info out of the -+ * __refcnt cache line. -+ */ -+ struct list_head rt_uncached; -+ struct uncached_list *rt_uncached_list; -+#ifdef CONFIG_64BIT -+ struct lwtunnel_state *lwtstate; -+#endif - }; - - struct dst_metrics { -diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h -index fa4e6af382e2a..9ba6413fd2e3e 100644 ---- a/include/net/ip6_fib.h -+++ b/include/net/ip6_fib.h -@@ -217,9 +217,6 @@ struct rt6_info { - struct inet6_dev *rt6i_idev; - u32 rt6i_flags; - -- struct list_head rt6i_uncached; -- struct uncached_list *rt6i_uncached_list; -- - /* more non-fragment space at head required */ - unsigned short rt6i_nfheader_len; - }; -diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h -index 035d61d50a989..6c6b673d92554 100644 ---- a/include/net/ip6_route.h -+++ b/include/net/ip6_route.h -@@ -104,7 +104,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net, - static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) - { - if (!(flags & RT6_LOOKUP_F_DST_NOREF) || -- !list_empty(&rt->rt6i_uncached)) -+ !list_empty(&rt->dst.rt_uncached)) - ip6_rt_put(rt); - } - -diff --git a/include/net/route.h b/include/net/route.h -index af8431b25f800..9ca0f72868b76 100644 ---- a/include/net/route.h -+++ b/include/net/route.h -@@ -78,9 +78,6 @@ struct rtable { - /* Miscellaneous cached information */ - u32 rt_mtu_locked:1, - rt_pmtu:31; -- -- struct list_head rt_uncached; -- struct uncached_list *rt_uncached_list; - }; - - static inline bool rt_is_input_route(const struct rtable *rt) -diff --git a/net/ipv4/route.c b/net/ipv4/route.c -index 9cbaae4f5ee71..7ccf6503d67aa 100644 ---- a/net/ipv4/route.c -+++ b/net/ipv4/route.c -@@ -1510,20 +1510,20 @@ void rt_add_uncached_list(struct rtable *rt) - { - struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); - -- rt->rt_uncached_list = ul; -+ rt->dst.rt_uncached_list = ul; - - spin_lock_bh(&ul->lock); -- list_add_tail(&rt->rt_uncached, &ul->head); -+ list_add_tail(&rt->dst.rt_uncached, &ul->head); - spin_unlock_bh(&ul->lock); - } - - void 
rt_del_uncached_list(struct rtable *rt) - { -- if (!list_empty(&rt->rt_uncached)) { -- struct uncached_list *ul = rt->rt_uncached_list; -+ if (!list_empty(&rt->dst.rt_uncached)) { -+ struct uncached_list *ul = rt->dst.rt_uncached_list; - - spin_lock_bh(&ul->lock); -- list_del_init(&rt->rt_uncached); -+ list_del_init(&rt->dst.rt_uncached); - spin_unlock_bh(&ul->lock); - } - } -@@ -1548,13 +1548,13 @@ void rt_flush_dev(struct net_device *dev) - continue; - - spin_lock_bh(&ul->lock); -- list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) { -+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { - if (rt->dst.dev != dev) - continue; - rt->dst.dev = blackhole_netdev; - netdev_ref_replace(dev, blackhole_netdev, - &rt->dst.dev_tracker, GFP_ATOMIC); -- list_move(&rt->rt_uncached, &ul->quarantine); -+ list_move(&rt->dst.rt_uncached, &ul->quarantine); - } - spin_unlock_bh(&ul->lock); - } -@@ -1646,7 +1646,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, - rt->rt_uses_gateway = 0; - rt->rt_gw_family = 0; - rt->rt_gw4 = 0; -- INIT_LIST_HEAD(&rt->rt_uncached); -+ INIT_LIST_HEAD(&rt->dst.rt_uncached); - - rt->dst.output = ip_output; - if (flags & RTCF_LOCAL) -@@ -1677,7 +1677,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) - new_rt->rt_gw4 = rt->rt_gw4; - else if (rt->rt_gw_family == AF_INET6) - new_rt->rt_gw6 = rt->rt_gw6; -- INIT_LIST_HEAD(&new_rt->rt_uncached); -+ INIT_LIST_HEAD(&new_rt->dst.rt_uncached); - - new_rt->dst.input = rt->dst.input; - new_rt->dst.output = rt->dst.output; -@@ -2862,7 +2862,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or - else if (rt->rt_gw_family == AF_INET6) - rt->rt_gw6 = ort->rt_gw6; - -- INIT_LIST_HEAD(&rt->rt_uncached); -+ INIT_LIST_HEAD(&rt->dst.rt_uncached); - } - - dst_release(dst_orig); -diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c -index 3d0dfa6cf9f96..47861c8b7340e 100644 ---- a/net/ipv4/xfrm4_policy.c -+++ b/net/ipv4/xfrm4_policy.c -@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - xdst->u.rt.rt_gw6 = rt->rt_gw6; - xdst->u.rt.rt_pmtu = rt->rt_pmtu; - xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; -- INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -+ INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached); - rt_add_uncached_list(&xdst->u.rt); - - return 0; -@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - - dst_destroy_metrics_generic(dst); -- if (xdst->u.rt.rt_uncached_list) -+ if (xdst->u.rt.dst.rt_uncached_list) - rt_del_uncached_list(&xdst->u.rt); - xfrm_dst_destroy(xdst); - } -diff --git a/net/ipv6/route.c b/net/ipv6/route.c -index 0bcdb675ba2c1..7205adee46c21 100644 ---- a/net/ipv6/route.c -+++ b/net/ipv6/route.c -@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt) - { - struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - -- rt->rt6i_uncached_list = ul; -+ rt->dst.rt_uncached_list = ul; - - spin_lock_bh(&ul->lock); -- list_add_tail(&rt->rt6i_uncached, &ul->head); -+ list_add_tail(&rt->dst.rt_uncached, &ul->head); - spin_unlock_bh(&ul->lock); - } - - void rt6_uncached_list_del(struct rt6_info *rt) - { -- if (!list_empty(&rt->rt6i_uncached)) { -- struct uncached_list *ul = rt->rt6i_uncached_list; -+ if (!list_empty(&rt->dst.rt_uncached)) { -+ struct uncached_list *ul = rt->dst.rt_uncached_list; - - spin_lock_bh(&ul->lock); -- list_del_init(&rt->rt6i_uncached); -+ list_del_init(&rt->dst.rt_uncached); - 
spin_unlock_bh(&ul->lock); - } - } -@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) - continue; - - spin_lock_bh(&ul->lock); -- list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) { -+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { - struct inet6_dev *rt_idev = rt->rt6i_idev; - struct net_device *rt_dev = rt->dst.dev; - bool handled = false; -@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) - handled = true; - } - if (handled) -- list_move(&rt->rt6i_uncached, -+ list_move(&rt->dst.rt_uncached, - &ul->quarantine); - } - spin_unlock_bh(&ul->lock); -@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { - static void rt6_info_init(struct rt6_info *rt) - { - memset_after(rt, 0, dst); -- INIT_LIST_HEAD(&rt->rt6i_uncached); -+ INIT_LIST_HEAD(&rt->dst.rt_uncached); - } - - /* allocate dst with ip6_dst_ops */ -@@ -2641,7 +2641,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, - dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; - /* For dst cached in uncached_list, refcnt is already taken. */ -- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { -+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { - dst = &net->ipv6.ip6_null_entry->dst; - dst_hold(dst); - } -@@ -2751,7 +2751,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, - from = rcu_dereference(rt->from); - - if (from && (rt->rt6i_flags & RTF_PCPU || -- unlikely(!list_empty(&rt->rt6i_uncached)))) -+ unlikely(!list_empty(&rt->dst.rt_uncached)))) - dst_ret = rt6_dst_from_check(rt, from, cookie); - else - dst_ret = rt6_check(rt, from, cookie); -@@ -6488,7 +6488,7 @@ static int __net_init ip6_route_net_init(struct net *net) - net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; - dst_init_metrics(&net->ipv6.ip6_null_entry->dst, - ip6_template_metrics, true); -- INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); -+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); - - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - net->ipv6.fib6_has_custom_rules = false; -@@ -6500,7 +6500,7 @@ static int __net_init ip6_route_net_init(struct net *net) - net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; - dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, - ip6_template_metrics, true); -- INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); -+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); - - net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, - sizeof(*net->ipv6.ip6_blk_hole_entry), -@@ -6510,7 +6510,7 @@ static int __net_init ip6_route_net_init(struct net *net) - net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; - dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, - ip6_template_metrics, true); -- INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); -+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); - #ifdef CONFIG_IPV6_SUBTREES - net->ipv6.fib6_routes_require_src = 0; - #endif -diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c -index ea435eba30534..2b493f8d00918 100644 ---- a/net/ipv6/xfrm6_policy.c -+++ b/net/ipv6/xfrm6_policy.c -@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; - xdst->u.rt6.rt6i_dst = rt->rt6i_dst; - xdst->u.rt6.rt6i_src = rt->rt6i_src; -- INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached); -+ 
INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached); - rt6_uncached_list_add(&xdst->u.rt6); - - return 0; -@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) - if (likely(xdst->u.rt6.rt6i_idev)) - in6_dev_put(xdst->u.rt6.rt6i_idev); - dst_destroy_metrics_generic(dst); -- if (xdst->u.rt6.rt6i_uncached_list) -+ if (xdst->u.rt6.dst.rt_uncached_list) - rt6_uncached_list_del(&xdst->u.rt6); - xfrm_dst_destroy(xdst); - } --- -2.40.1 - diff --git a/queue-6.1/net-dst-switch-to-rcuref_t-reference-counting.patch b/queue-6.1/net-dst-switch-to-rcuref_t-reference-counting.patch deleted file mode 100644 index 8ed6867cc71..00000000000 --- a/queue-6.1/net-dst-switch-to-rcuref_t-reference-counting.patch +++ /dev/null @@ -1,259 +0,0 @@ -From 180ab46081f3404a77e4cef550c4f0b28701a1b3 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Thu, 23 Mar 2023 21:55:32 +0100 -Subject: net: dst: Switch to rcuref_t reference counting - -From: Thomas Gleixner - -[ Upstream commit bc9d3a9f2afca189a6ae40225b6985e3c775375e ] - -Under high contention dst_entry::__refcnt becomes a significant bottleneck. - -atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into -high retry rates on contention. - -Switch the reference count to rcuref_t which results in a significant -performance gain. Rename the reference count member to __rcuref to reflect -the change. - -The gain depends on the micro-architecture and the number of concurrent -operations and has been measured in the range of +25% to +130% with a -localhost memtier/memcached benchmark which amplifies the problem -massively. - -Running the memtier/memcached benchmark over a real (1Gb) network -connection the conversion on top of the false sharing fix for struct -dst_entry::__refcnt results in a total gain in the 2%-5% range over the -upstream baseline. - -Reported-by: Wangyang Guo -Reported-by: Arjan Van De Ven -Signed-off-by: Thomas Gleixner -Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de -Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de -Signed-off-by: Jakub Kicinski -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - include/net/dst.h | 19 ++++++++++--------- - include/net/sock.h | 2 +- - net/bridge/br_nf_core.c | 2 +- - net/core/dst.c | 26 +++++--------------------- - net/core/rtnetlink.c | 2 +- - net/ipv6/route.c | 6 +++--- - net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- - 7 files changed, 23 insertions(+), 38 deletions(-) - -diff --git a/include/net/dst.h b/include/net/dst.h -index 81f2279ea911a..78884429deed8 100644 ---- a/include/net/dst.h -+++ b/include/net/dst.h -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -61,11 +62,11 @@ struct dst_entry { - unsigned short trailer_len; /* space to reserve at tail */ - - /* -- * __refcnt wants to be on a different cache line from -+ * __rcuref wants to be on a different cache line from - * input/output/ops or performance tanks badly - */ - #ifdef CONFIG_64BIT -- atomic_t __refcnt; /* 64-bit offset 64 */ -+ rcuref_t __rcuref; /* 64-bit offset 64 */ - #endif - int __use; - unsigned long lastuse; -@@ -75,16 +76,16 @@ struct dst_entry { - __u32 tclassid; - #ifndef CONFIG_64BIT - struct lwtunnel_state *lwtstate; -- atomic_t __refcnt; /* 32-bit offset 64 */ -+ rcuref_t __rcuref; /* 32-bit offset 64 */ - #endif - netdevice_tracker dev_tracker; - - /* - * Used by rtable and rt6_info. 
Moves lwtstate into the next cache - * line on 64bit so that lwtstate does not cause false sharing with -- * __refcnt under contention of __refcnt. This also puts the -+ * __rcuref under contention of __rcuref. This also puts the - * frequently accessed members of rtable and rt6_info out of the -- * __refcnt cache line. -+ * __rcuref cache line. - */ - struct list_head rt_uncached; - struct uncached_list *rt_uncached_list; -@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst) - { - /* - * If your kernel compilation stops here, please check -- * the placement of __refcnt in struct dst_entry -+ * the placement of __rcuref in struct dst_entry - */ -- BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); -- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); -+ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); -+ WARN_ON(!rcuref_get(&dst->__rcuref)); - } - - static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) -@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb - */ - static inline bool dst_hold_safe(struct dst_entry *dst) - { -- return atomic_inc_not_zero(&dst->__refcnt); -+ return rcuref_get(&dst->__rcuref); - } - - /** -diff --git a/include/net/sock.h b/include/net/sock.h -index fe695e8bfe289..4c988b981d6e1 100644 ---- a/include/net/sock.h -+++ b/include/net/sock.h -@@ -2181,7 +2181,7 @@ sk_dst_get(struct sock *sk) - - rcu_read_lock(); - dst = rcu_dereference(sk->sk_dst_cache); -- if (dst && !atomic_inc_not_zero(&dst->__refcnt)) -+ if (dst && !rcuref_get(&dst->__rcuref)) - dst = NULL; - rcu_read_unlock(); - return dst; -diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c -index 8c69f0c95a8ed..98aea5485aaef 100644 ---- a/net/bridge/br_nf_core.c -+++ b/net/bridge/br_nf_core.c -@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) - { - struct rtable *rt = &br->fake_rtable; - -- atomic_set(&rt->dst.__refcnt, 1); -+ rcuref_init(&rt->dst.__rcuref, 1); - rt->dst.dev = br->dev; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; -diff --git a/net/core/dst.c b/net/core/dst.c -index a4e738d321ba2..2b7b1619b5e29 100644 ---- a/net/core/dst.c -+++ b/net/core/dst.c -@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, - dst->tclassid = 0; - #endif - dst->lwtstate = NULL; -- atomic_set(&dst->__refcnt, initial_ref); -+ rcuref_init(&dst->__rcuref, initial_ref); - dst->__use = 0; - dst->lastuse = jiffies; - dst->flags = flags; -@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); - - void dst_release(struct dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); -- } -+ if (dst && rcuref_put(&dst->__rcuref)) -+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } - EXPORT_SYMBOL(dst_release); - - void dst_release_immediate(struct dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- dst_destroy(dst); -- } -+ if (dst && rcuref_put(&dst->__rcuref)) -+ dst_destroy(dst); - } - EXPORT_SYMBOL(dst_release_immediate); - -diff --git 
a/net/core/rtnetlink.c b/net/core/rtnetlink.c -index 854b3fd66b1be..90810408cc5df 100644 ---- a/net/core/rtnetlink.c -+++ b/net/core/rtnetlink.c -@@ -839,7 +839,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); - ci.rta_used = dst->__use; -- ci.rta_clntref = atomic_read(&dst->__refcnt); -+ ci.rta_clntref = rcuref_read(&dst->__rcuref); - } - if (expires) { - unsigned long clock; -diff --git a/net/ipv6/route.c b/net/ipv6/route.c -index 7205adee46c21..9db0b2318e918 100644 ---- a/net/ipv6/route.c -+++ b/net/ipv6/route.c -@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = { - - static const struct rt6_info ip6_null_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__rcuref = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -ENETUNREACH, -@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { - - static const struct rt6_info ip6_prohibit_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__rcuref = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EACCES, -@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { - - static const struct rt6_info ip6_blk_hole_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__rcuref = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EINVAL, -diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c -index 7243079ef3546..70ef036909fb0 100644 ---- a/net/netfilter/ipvs/ip_vs_xmit.c -+++ b/net/netfilter/ipvs/ip_vs_xmit.c -@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", - &dest->addr.ip, &dest_dst->dst_saddr.ip, -- atomic_read(&rt->dst.__refcnt)); -+ rcuref_read(&rt->dst.__rcuref)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; -@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest_dst->dst_saddr.in6, -- atomic_read(&rt->dst.__refcnt)); -+ rcuref_read(&rt->dst.__rcuref)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; --- -2.40.1 - diff --git a/queue-6.1/net-use-call_rcu_hurry-for-dst_release.patch b/queue-6.1/net-use-call_rcu_hurry-for-dst_release.patch deleted file mode 100644 index 2677027a7af..00000000000 --- a/queue-6.1/net-use-call_rcu_hurry-for-dst_release.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 51290b74abe5ae7c0313a41f7e182e0d23a0ad56 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Fri, 18 Nov 2022 19:19:08 +0000 -Subject: net: Use call_rcu_hurry() for dst_release() - -From: Joel Fernandes (Google) - -[ Upstream commit 483c26ff63f42e8898ed43aca0b9953bc91f0cd4 ] - -In a networking test on ChromeOS, kernels built with the new -CONFIG_RCU_LAZY=y Kconfig option fail a networking test in the teardown -phase. - -This failure may be reproduced as follows: ip netns del - -The CONFIG_RCU_LAZY=y Kconfig option was introduced by earlier commits -in this series for the benefit of certain battery-powered systems. -This Kconfig option causes call_rcu() to delay its callbacks in order -to batch them. 
This means that a given RCU grace period covers more -callbacks, thus reducing the number of grace periods, in turn reducing -the amount of energy consumed, which increases battery lifetime which -can be a very good thing. This is not a subtle effect: In some important -use cases, the battery lifetime is increased by more than 10%. - -This CONFIG_RCU_LAZY=y option is available only for CPUs that offload -callbacks, for example, CPUs mentioned in the rcu_nocbs kernel boot -parameter passed to kernels built with CONFIG_RCU_NOCB_CPU=y. - -Delaying callbacks is normally not a problem because most callbacks do -nothing but free memory. If the system is short on memory, a shrinker -will kick all currently queued lazy callbacks out of their laziness, -thus freeing their memory in short order. Similarly, the rcu_barrier() -function, which blocks until all currently queued callbacks are invoked, -will also kick lazy callbacks, thus enabling rcu_barrier() to complete -in a timely manner. - -However, there are some cases where laziness is not a good option. -For example, synchronize_rcu() invokes call_rcu(), and blocks until -the newly queued callback is invoked. It would not be a good for -synchronize_rcu() to block for ten seconds, even on an idle system. -Therefore, synchronize_rcu() invokes call_rcu_hurry() instead of -call_rcu(). The arrival of a non-lazy call_rcu_hurry() callback on a -given CPU kicks any lazy callbacks that might be already queued on that -CPU. After all, if there is going to be a grace period, all callbacks -might as well get full benefit from it. - -Yes, this could be done the other way around by creating a -call_rcu_lazy(), but earlier experience with this approach and -feedback at the 2022 Linux Plumbers Conference shifted the approach -to call_rcu() being lazy with call_rcu_hurry() for the few places -where laziness is inappropriate. - -Returning to the test failure, use of ftrace showed that this failure -cause caused by the aadded delays due to this new lazy behavior of -call_rcu() in kernels built with CONFIG_RCU_LAZY=y. - -Therefore, make dst_release() use call_rcu_hurry() in order to revert -to the old test-failure-free behavior. - -[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ] - -Signed-off-by: Joel Fernandes (Google) -Cc: David Ahern -Cc: "David S. Miller" -Cc: Hideaki YOSHIFUJI -Cc: Jakub Kicinski -Cc: Paolo Abeni -Cc: -Reviewed-by: Eric Dumazet -Signed-off-by: Paul E. 
McKenney -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - net/core/dst.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/net/core/dst.c b/net/core/dst.c -index bc9c9be4e0801..a4e738d321ba2 100644 ---- a/net/core/dst.c -+++ b/net/core/dst.c -@@ -174,7 +174,7 @@ void dst_release(struct dst_entry *dst) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) -- call_rcu(&dst->rcu_head, dst_destroy_rcu); -+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } - } - EXPORT_SYMBOL(dst_release); --- -2.40.1 - diff --git a/queue-6.1/rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch b/queue-6.1/rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch deleted file mode 100644 index e38cc5f00b5..00000000000 --- a/queue-6.1/rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 3f132e8e674299042d9e5313dfbfcb3de55af912 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Sat, 17 Sep 2022 16:41:59 +0000 -Subject: rcu: Fix late wakeup when flush of bypass cblist happens - -From: Joel Fernandes (Google) - -[ Upstream commit b50606f35f4b73c8e4c6b9c64fe7ba72ea919134 ] - -When the bypass cblist gets too big or its timeout has occurred, it is -flushed into the main cblist. However, the bypass timer is still running -and the behavior is that it would eventually expire and wake the GP -thread. - -Since we are going to use the bypass cblist for lazy CBs, do the wakeup -soon as the flush for "too big or too long" bypass list happens. -Otherwise, long delays can happen for callbacks which get promoted from -lazy to non-lazy. - -This is a good thing to do anyway (regardless of future lazy patches), -since it makes the behavior consistent with behavior of other code paths -where flushing into the ->cblist makes the GP kthread into a -non-sleeping state quickly. - -[ Frederic Weisbecker: Changes to avoid unnecessary GP-thread wakeups plus - comment changes. ] - -Reviewed-by: Frederic Weisbecker -Signed-off-by: Joel Fernandes (Google) -Signed-off-by: Paul E. McKenney -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - kernel/rcu/tree_nocb.h | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h -index 0a5f0ef414845..04c87f250e01a 100644 ---- a/kernel/rcu/tree_nocb.h -+++ b/kernel/rcu/tree_nocb.h -@@ -433,8 +433,9 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || - ncbs >= qhimark) { - rcu_nocb_lock(rdp); -+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); -+ - if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { -- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); -@@ -447,7 +448,12 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } -- rcu_nocb_unlock_irqrestore(rdp, flags); -+ -+ // The flush succeeded and we moved CBs into the regular list. -+ // Don't wait for the wake up timer as it may be too far ahead. -+ // Wake up the GP thread now instead, if the cblist was empty. -+ __call_rcu_nocb_wake(rdp, *was_alldone, flags); -+ - return true; // Callback already enqueued. 
- } - --- -2.40.1 - diff --git a/queue-6.1/rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch b/queue-6.1/rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch deleted file mode 100644 index d1ccd6ce3de..00000000000 --- a/queue-6.1/rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 6e201fbbe533ee08318f49c360c83145a1231ac2 Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Sun, 16 Oct 2022 16:22:53 +0000 -Subject: rcu: Fix missing nocb gp wake on rcu_barrier() - -From: Frederic Weisbecker - -[ Upstream commit b8f7aca3f0e0e6223094ba2662bac90353674b04 ] - -In preparation for RCU lazy changes, wake up the RCU nocb gp thread if -needed after an entrain. This change prevents the RCU barrier callback -from waiting in the queue for several seconds before the lazy callbacks -in front of it are serviced. - -Reported-by: Joel Fernandes (Google) -Signed-off-by: Frederic Weisbecker -Signed-off-by: Joel Fernandes (Google) -Signed-off-by: Paul E. McKenney -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - kernel/rcu/tree.c | 11 +++++++++++ - kernel/rcu/tree.h | 1 + - kernel/rcu/tree_nocb.h | 5 +++++ - 3 files changed, 17 insertions(+) - -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index 917a1e43f7839..6ea59aa53db78 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -3908,6 +3908,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) - { - unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); - unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); -+ bool wake_nocb = false; -+ bool was_alldone = false; - - lockdep_assert_held(&rcu_state.barrier_lock); - if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) -@@ -3916,7 +3918,14 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) - rdp->barrier_head.func = rcu_barrier_callback; - debug_rcu_head_queue(&rdp->barrier_head); - rcu_nocb_lock(rdp); -+ /* -+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular -+ * queue. This way we don't wait for bypass timer that can reach seconds -+ * if it's fully lazy. 
-+ */ -+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); -+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { - atomic_inc(&rcu_state.barrier_cpu_count); - } else { -@@ -3924,6 +3933,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) - rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); - } - rcu_nocb_unlock(rdp); -+ if (wake_nocb) -+ wake_nocb_gp(rdp, false); - smp_store_release(&rdp->barrier_seq_snap, gseq); - } - -diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h -index d4a97e40ea9c3..925dd98f8b23b 100644 ---- a/kernel/rcu/tree.h -+++ b/kernel/rcu/tree.h -@@ -439,6 +439,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); - static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); - static void rcu_init_one_nocb(struct rcu_node *rnp); -+static bool wake_nocb_gp(struct rcu_data *rdp, bool force); - static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j); - static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h -index 04c87f250e01a..74d4983d68f82 100644 ---- a/kernel/rcu/tree_nocb.h -+++ b/kernel/rcu/tree_nocb.h -@@ -1570,6 +1570,11 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) - { - } - -+static bool wake_nocb_gp(struct rcu_data *rdp, bool force) -+{ -+ return false; -+} -+ - static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) - { --- -2.40.1 - diff --git a/queue-6.1/rcu-make-call_rcu-lazy-to-save-power.patch b/queue-6.1/rcu-make-call_rcu-lazy-to-save-power.patch deleted file mode 100644 index f81ebaa1b05..00000000000 --- a/queue-6.1/rcu-make-call_rcu-lazy-to-save-power.patch +++ /dev/null @@ -1,689 +0,0 @@ -From 7b253194c188b40a04df52ea0aeacae23989ef0d Mon Sep 17 00:00:00 2001 -From: Sasha Levin -Date: Sun, 16 Oct 2022 16:22:54 +0000 -Subject: rcu: Make call_rcu() lazy to save power - -From: Joel Fernandes (Google) - -[ Upstream commit 3cb278e73be58bfb780ecd55129296d2f74c1fb7 ] - -Implement timer-based RCU callback batching (also known as lazy -callbacks). With this we save about 5-10% of power consumed due -to RCU requests that happen when system is lightly loaded or idle. - -By default, all async callbacks (queued via call_rcu) are marked -lazy. An alternate API call_rcu_hurry() is provided for the few users, -for example synchronize_rcu(), that need the old behavior. - -The batch is flushed whenever a certain amount of time has passed, or -the batch on a particular CPU grows too big. Also memory pressure will -flush it in a future patch. - -To handle several corner cases automagically (such as rcu_barrier() and -hotplug), we re-use bypass lists which were originally introduced to -address lock contention, to handle lazy CBs as well. The bypass list -length has the lazy CB length included in it. A separate lazy CB length -counter is also introduced to keep track of the number of lazy CBs. - -[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ] -[ paulmck: Apply Zqiang feedback. ] -[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ] - -Suggested-by: Paul McKenney -Acked-by: Frederic Weisbecker -Signed-off-by: Joel Fernandes (Google) -Signed-off-by: Paul E. 
McKenney -Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem") -Signed-off-by: Sasha Levin ---- - include/linux/rcupdate.h | 9 +++ - kernel/rcu/Kconfig | 8 ++ - kernel/rcu/rcu.h | 8 ++ - kernel/rcu/tiny.c | 2 +- - kernel/rcu/tree.c | 129 ++++++++++++++++++++----------- - kernel/rcu/tree.h | 11 ++- - kernel/rcu/tree_exp.h | 2 +- - kernel/rcu/tree_nocb.h | 159 +++++++++++++++++++++++++++++++-------- - 8 files changed, 246 insertions(+), 82 deletions(-) - -diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h -index e9e61cd27ef63..46f05dc8b31aa 100644 ---- a/include/linux/rcupdate.h -+++ b/include/linux/rcupdate.h -@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void) - - #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -+#ifdef CONFIG_RCU_LAZY -+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); -+#else -+static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) -+{ -+ call_rcu(head, func); -+} -+#endif -+ - /* Internal to kernel */ - void rcu_init(void); - extern int rcu_scheduler_active; -diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig -index d471d22a5e21b..d78f6181c8aad 100644 ---- a/kernel/rcu/Kconfig -+++ b/kernel/rcu/Kconfig -@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB - Say N here if you hate read-side memory barriers. - Take the default if you are unsure. - -+config RCU_LAZY -+ bool "RCU callback lazy invocation functionality" -+ depends on RCU_NOCB_CPU -+ default n -+ help -+ To save power, batch RCU callbacks and flush after delay, memory -+ pressure, or callback list growing too big. -+ - endmenu # "RCU Subsystem" -diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h -index 48d8f754b730e..6b86c5912beaf 100644 ---- a/kernel/rcu/rcu.h -+++ b/kernel/rcu/rcu.h -@@ -474,6 +474,14 @@ enum rcutorture_type { - INVALID_RCU_FLAVOR - }; - -+#if defined(CONFIG_RCU_LAZY) -+unsigned long rcu_lazy_get_jiffies_till_flush(void); -+void rcu_lazy_set_jiffies_till_flush(unsigned long j); -+#else -+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } -+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } -+#endif -+ - #if defined(CONFIG_TREE_RCU) - void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq); -diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c -index a33a8d4942c37..72913ce21258b 100644 ---- a/kernel/rcu/tiny.c -+++ b/kernel/rcu/tiny.c -@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = { - - void rcu_barrier(void) - { -- wait_rcu_gp(call_rcu); -+ wait_rcu_gp(call_rcu_hurry); - } - EXPORT_SYMBOL(rcu_barrier); - -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index 6ea59aa53db78..855c035ec9630 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -2731,47 +2731,8 @@ static void check_cb_ovld(struct rcu_data *rdp) - raw_spin_unlock_rcu_node(rnp); - } - --/** -- * call_rcu() - Queue an RCU callback for invocation after a grace period. -- * @head: structure to be used for queueing the RCU updates. -- * @func: actual callback function to be invoked after the grace period -- * -- * The callback function will be invoked some time after a full grace -- * period elapses, in other words after all pre-existing RCU read-side -- * critical sections have completed. However, the callback function -- * might well execute concurrently with RCU read-side critical sections -- * that started after call_rcu() was invoked. 
-- * -- * RCU read-side critical sections are delimited by rcu_read_lock() -- * and rcu_read_unlock(), and may be nested. In addition, but only in -- * v5.0 and later, regions of code across which interrupts, preemption, -- * or softirqs have been disabled also serve as RCU read-side critical -- * sections. This includes hardware interrupt handlers, softirq handlers, -- * and NMI handlers. -- * -- * Note that all CPUs must agree that the grace period extended beyond -- * all pre-existing RCU read-side critical section. On systems with more -- * than one CPU, this means that when "func()" is invoked, each CPU is -- * guaranteed to have executed a full memory barrier since the end of its -- * last RCU read-side critical section whose beginning preceded the call -- * to call_rcu(). It also means that each CPU executing an RCU read-side -- * critical section that continues beyond the start of "func()" must have -- * executed a memory barrier after the call_rcu() but before the beginning -- * of that RCU read-side critical section. Note that these guarantees -- * include CPUs that are offline, idle, or executing in user mode, as -- * well as CPUs that are executing in the kernel. -- * -- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the -- * resulting RCU callback function "func()", then both CPU A and CPU B are -- * guaranteed to execute a full memory barrier during the time interval -- * between the call to call_rcu() and the invocation of "func()" -- even -- * if CPU A and CPU B are the same CPU (but again only if the system has -- * more than one CPU). -- * -- * Implementation of these memory-ordering guarantees is described here: -- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. -- */ --void call_rcu(struct rcu_head *head, rcu_callback_t func) -+static void -+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) - { - static atomic_t doublefrees; - unsigned long flags; -@@ -2812,7 +2773,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) - } - - check_cb_ovld(rdp); -- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) -+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) - return; // Enqueued onto ->nocb_bypass, so just leave. - // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. - rcu_segcblist_enqueue(&rdp->cblist, head); -@@ -2834,8 +2795,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) - local_irq_restore(flags); - } - } --EXPORT_SYMBOL_GPL(call_rcu); - -+#ifdef CONFIG_RCU_LAZY -+/** -+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and -+ * flush all lazy callbacks (including the new one) to the main ->cblist while -+ * doing so. -+ * -+ * @head: structure to be used for queueing the RCU updates. -+ * @func: actual callback function to be invoked after the grace period -+ * -+ * The callback function will be invoked some time after a full grace -+ * period elapses, in other words after all pre-existing RCU read-side -+ * critical sections have completed. -+ * -+ * Use this API instead of call_rcu() if you don't want the callback to be -+ * invoked after very long periods of time, which can happen on systems without -+ * memory pressure and on systems which are lightly loaded or mostly idle. -+ * This function will cause callbacks to be invoked sooner than later at the -+ * expense of extra power. Other than that, this function is identical to, and -+ * reuses call_rcu()'s logic. 
Refer to call_rcu() for more details about memory -+ * ordering and other functionality. -+ */ -+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) -+{ -+ return __call_rcu_common(head, func, false); -+} -+EXPORT_SYMBOL_GPL(call_rcu_hurry); -+#endif -+ -+/** -+ * call_rcu() - Queue an RCU callback for invocation after a grace period. -+ * By default the callbacks are 'lazy' and are kept hidden from the main -+ * ->cblist to prevent starting of grace periods too soon. -+ * If you desire grace periods to start very soon, use call_rcu_hurry(). -+ * -+ * @head: structure to be used for queueing the RCU updates. -+ * @func: actual callback function to be invoked after the grace period -+ * -+ * The callback function will be invoked some time after a full grace -+ * period elapses, in other words after all pre-existing RCU read-side -+ * critical sections have completed. However, the callback function -+ * might well execute concurrently with RCU read-side critical sections -+ * that started after call_rcu() was invoked. -+ * -+ * RCU read-side critical sections are delimited by rcu_read_lock() -+ * and rcu_read_unlock(), and may be nested. In addition, but only in -+ * v5.0 and later, regions of code across which interrupts, preemption, -+ * or softirqs have been disabled also serve as RCU read-side critical -+ * sections. This includes hardware interrupt handlers, softirq handlers, -+ * and NMI handlers. -+ * -+ * Note that all CPUs must agree that the grace period extended beyond -+ * all pre-existing RCU read-side critical section. On systems with more -+ * than one CPU, this means that when "func()" is invoked, each CPU is -+ * guaranteed to have executed a full memory barrier since the end of its -+ * last RCU read-side critical section whose beginning preceded the call -+ * to call_rcu(). It also means that each CPU executing an RCU read-side -+ * critical section that continues beyond the start of "func()" must have -+ * executed a memory barrier after the call_rcu() but before the beginning -+ * of that RCU read-side critical section. Note that these guarantees -+ * include CPUs that are offline, idle, or executing in user mode, as -+ * well as CPUs that are executing in the kernel. -+ * -+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the -+ * resulting RCU callback function "func()", then both CPU A and CPU B are -+ * guaranteed to execute a full memory barrier during the time interval -+ * between the call to call_rcu() and the invocation of "func()" -- even -+ * if CPU A and CPU B are the same CPU (but again only if the system has -+ * more than one CPU). -+ * -+ * Implementation of these memory-ordering guarantees is described here: -+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. -+ */ -+void call_rcu(struct rcu_head *head, rcu_callback_t func) -+{ -+ return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); -+} -+EXPORT_SYMBOL_GPL(call_rcu); - - /* Maximum number of jiffies to wait before draining a batch. */ - #define KFREE_DRAIN_JIFFIES (5 * HZ) -@@ -3521,7 +3558,7 @@ void synchronize_rcu(void) - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else -- wait_rcu_gp(call_rcu); -+ wait_rcu_gp(call_rcu_hurry); - return; - } - -@@ -3924,7 +3961,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) - * if it's fully lazy. 
- */ - was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist); -- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); -+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); - wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { - atomic_inc(&rcu_state.barrier_cpu_count); -@@ -4359,7 +4396,7 @@ void rcutree_migrate_callbacks(int cpu) - my_rdp = this_cpu_ptr(&rcu_data); - my_rnp = my_rdp->mynode; - rcu_nocb_lock(my_rdp); /* irqs already disabled. */ -- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); -+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false)); - raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ - /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(my_rnp, rdp) || -diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h -index 925dd98f8b23b..fcb5d696eb170 100644 ---- a/kernel/rcu/tree.h -+++ b/kernel/rcu/tree.h -@@ -263,14 +263,16 @@ struct rcu_data { - unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ - unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ - -+ long lazy_len; /* Length of buffered lazy callbacks. */ - int cpu; - }; - - /* Values for nocb_defer_wakeup field in struct rcu_data. */ - #define RCU_NOCB_WAKE_NOT 0 - #define RCU_NOCB_WAKE_BYPASS 1 --#define RCU_NOCB_WAKE 2 --#define RCU_NOCB_WAKE_FORCE 3 -+#define RCU_NOCB_WAKE_LAZY 2 -+#define RCU_NOCB_WAKE 3 -+#define RCU_NOCB_WAKE_FORCE 4 - - #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) - /* For jiffies_till_first_fqs and */ -@@ -441,9 +443,10 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); - static void rcu_init_one_nocb(struct rcu_node *rnp); - static bool wake_nocb_gp(struct rcu_data *rdp, bool force); - static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- unsigned long j); -+ unsigned long j, bool lazy); - static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- bool *was_alldone, unsigned long flags); -+ bool *was_alldone, unsigned long flags, -+ bool lazy); - static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags); - static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); -diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h -index aa3ec3c3b9f75..b9637df7cda70 100644 ---- a/kernel/rcu/tree_exp.h -+++ b/kernel/rcu/tree_exp.h -@@ -941,7 +941,7 @@ void synchronize_rcu_expedited(void) - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { -- wait_rcu_gp(call_rcu); -+ wait_rcu_gp(call_rcu_hurry); - return; - } - -diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h -index 74d4983d68f82..c3ec5f389d27f 100644 ---- a/kernel/rcu/tree_nocb.h -+++ b/kernel/rcu/tree_nocb.h -@@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) - return __wake_nocb_gp(rdp_gp, rdp, force, flags); - } - -+/* -+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that -+ * can elapse before lazy callbacks are flushed. Lazy callbacks -+ * could be flushed much earlier for a number of other reasons -+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are -+ * left unsubmitted to RCU after those many jiffies. -+ */ -+#define LAZY_FLUSH_JIFFIES (10 * HZ) -+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; -+ -+#ifdef CONFIG_RCU_LAZY -+// To be called only from test code. 
-+void rcu_lazy_set_jiffies_till_flush(unsigned long jif) -+{ -+ jiffies_till_flush = jif; -+} -+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); -+ -+unsigned long rcu_lazy_get_jiffies_till_flush(void) -+{ -+ return jiffies_till_flush; -+} -+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); -+#endif -+ - /* - * Arrange to wake the GP kthread for this NOCB group at some future - * time when it is safe to do so. -@@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - - /* -- * Bypass wakeup overrides previous deferments. In case -- * of callback storm, no need to wake up too early. -+ * Bypass wakeup overrides previous deferments. In case of -+ * callback storms, no need to wake up too early. - */ -- if (waketype == RCU_NOCB_WAKE_BYPASS) { -+ if (waketype == RCU_NOCB_WAKE_LAZY && -+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { -+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); -+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); -+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) { - mod_timer(&rdp_gp->nocb_timer, jiffies + 2); - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } else { -@@ -293,10 +322,13 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * -+ * Return true if there was something to be flushed and it succeeded, otherwise -+ * false. -+ * - * Note that this function always returns true if rhp is NULL. - */ - static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- unsigned long j) -+ unsigned long j, bool lazy) - { - struct rcu_cblist rcl; - -@@ -310,7 +342,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ - if (rhp) - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ -- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); -+ -+ /* -+ * If the new CB requested was a lazy one, queue it onto the main -+ * ->cblist so we can take advantage of a sooner grade period. -+ */ -+ if (lazy && rhp) { -+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL); -+ rcu_cblist_enqueue(&rcl, rhp); -+ WRITE_ONCE(rdp->lazy_len, 0); -+ } else { -+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); -+ WRITE_ONCE(rdp->lazy_len, 0); -+ } -+ - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); - WRITE_ONCE(rdp->nocb_bypass_first, j); - rcu_nocb_bypass_unlock(rdp); -@@ -326,13 +371,13 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - * Note that this function always returns true if rhp is NULL. 
- */ - static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- unsigned long j) -+ unsigned long j, bool lazy) - { - if (!rcu_rdp_is_offloaded(rdp)) - return true; - rcu_lockdep_assert_cblist_protected(rdp); - rcu_nocb_bypass_lock(rdp); -- return rcu_nocb_do_flush_bypass(rdp, rhp, j); -+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy); - } - - /* -@@ -345,7 +390,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) - if (!rcu_rdp_is_offloaded(rdp) || - !rcu_nocb_bypass_trylock(rdp)) - return; -- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); -+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false)); - } - - /* -@@ -367,12 +412,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) - * there is only one CPU in operation. - */ - static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- bool *was_alldone, unsigned long flags) -+ bool *was_alldone, unsigned long flags, -+ bool lazy) - { - unsigned long c; - unsigned long cur_gp_seq; - unsigned long j = jiffies; - long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); -+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len)); - - lockdep_assert_irqs_disabled(); - -@@ -417,25 +464,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - // If there hasn't yet been all that many ->cblist enqueues - // this jiffy, tell the caller to enqueue onto ->cblist. But flush - // ->nocb_bypass first. -- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { -+ // Lazy CBs throttle this back and do immediate bypass queuing. -+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); -- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); -+ -+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false)); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - - // If ->nocb_bypass has been used too long or is too full, - // flush ->nocb_bypass to ->cblist. -- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || -+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || -+ (ncbs && bypass_is_lazy && -+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || - ncbs >= qhimark) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - -- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { -+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) { - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); -@@ -463,13 +514,24 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); -+ -+ if (lazy) -+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1); -+ - if (!ncbs) { - WRITE_ONCE(rdp->nocb_bypass_first, j); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); - } - rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ -- if (ncbs) { -+ // A wake up of the grace period kthread or timer adjustment -+ // needs to be done only if: -+ // 1. Bypass list was fully empty before (this is the first -+ // bypass list entry), or: -+ // 2. Both of these conditions are met: -+ // a. 
The bypass list previously had only lazy CBs, and: -+ // b. The new CB is non-lazy. -+ if (ncbs && (!bypass_is_lazy || lazy)) { - local_irq_restore(flags); - } else { - // No-CBs GP kthread might be indefinitely asleep, if so, wake. -@@ -497,8 +559,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, - unsigned long flags) - __releases(rdp->nocb_lock) - { -+ long bypass_len; - unsigned long cur_gp_seq; - unsigned long j; -+ long lazy_len; - long len; - struct task_struct *t; - -@@ -512,9 +576,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, - } - // Need to actually to a wakeup. - len = rcu_segcblist_n_cbs(&rdp->cblist); -+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass); -+ lazy_len = READ_ONCE(rdp->lazy_len); - if (was_alldone) { - rdp->qlen_last_fqs_check = len; -- if (!irqs_disabled_flags(flags)) { -+ // Only lazy CBs in bypass list -+ if (lazy_len && bypass_len == lazy_len) { -+ rcu_nocb_unlock_irqrestore(rdp, flags); -+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, -+ TPS("WakeLazy")); -+ } else if (!irqs_disabled_flags(flags)) { - /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp(rdp, false); -@@ -605,12 +676,12 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) - static void nocb_gp_wait(struct rcu_data *my_rdp) - { - bool bypass = false; -- long bypass_ncbs; - int __maybe_unused cpu = my_rdp->cpu; - unsigned long cur_gp_seq; - unsigned long flags; - bool gotcbs = false; - unsigned long j = jiffies; -+ bool lazy = false; - bool needwait_gp = false; // This prevents actual uninitialized use. - bool needwake; - bool needwake_gp; -@@ -640,24 +711,43 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) - * won't be ignored for long. - */ - list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) { -+ long bypass_ncbs; -+ bool flush_bypass = false; -+ long lazy_ncbs; -+ - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); - rcu_nocb_lock_irqsave(rdp, flags); - lockdep_assert_held(&rdp->nocb_lock); - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); -- if (bypass_ncbs && -+ lazy_ncbs = READ_ONCE(rdp->lazy_len); -+ -+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && -+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || -+ bypass_ncbs > 2 * qhimark)) { -+ flush_bypass = true; -+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || - bypass_ncbs > 2 * qhimark)) { -- // Bypass full or old, so flush it. -- (void)rcu_nocb_try_flush_bypass(rdp, j); -- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); -+ flush_bypass = true; - } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - continue; /* No callbacks here, try next. */ - } -+ -+ if (flush_bypass) { -+ // Bypass full or old, so flush it. -+ (void)rcu_nocb_try_flush_bypass(rdp, j); -+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); -+ lazy_ncbs = READ_ONCE(rdp->lazy_len); -+ } -+ - if (bypass_ncbs) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, -- TPS("Bypass")); -- bypass = true; -+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass")); -+ if (bypass_ncbs == lazy_ncbs) -+ lazy = true; -+ else -+ bypass = true; - } - rnp = rdp->mynode; - -@@ -705,12 +795,20 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) - my_rdp->nocb_gp_gp = needwait_gp; - my_rdp->nocb_gp_seq = needwait_gp ? 
wait_gp_seq : 0; - -- if (bypass && !rcu_nocb_poll) { -- // At least one child with non-empty ->nocb_bypass, so set -- // timer in order to avoid stranding its callbacks. -- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, -- TPS("WakeBypassIsDeferred")); -+ // At least one child with non-empty ->nocb_bypass, so set -+ // timer in order to avoid stranding its callbacks. -+ if (!rcu_nocb_poll) { -+ // If bypass list only has lazy CBs. Add a deferred lazy wake up. -+ if (lazy && !bypass) { -+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY, -+ TPS("WakeLazyIsDeferred")); -+ // Otherwise add a deferred bypass wake up. -+ } else if (bypass) { -+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, -+ TPS("WakeBypassIsDeferred")); -+ } - } -+ - if (rcu_nocb_poll) { - /* Polling, so trace if first poll in the series. */ - if (gotcbs) -@@ -1036,7 +1134,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ -- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); -+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); - /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, -@@ -1290,6 +1388,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) - raw_spin_lock_init(&rdp->nocb_gp_lock); - timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); - rcu_cblist_init(&rdp->nocb_bypass); -+ WRITE_ONCE(rdp->lazy_len, 0); - mutex_init(&rdp->nocb_gp_kthread_mutex); - } - -@@ -1576,13 +1675,13 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) - } - - static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- unsigned long j) -+ unsigned long j, bool lazy) - { - return true; - } - - static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, -- bool *was_alldone, unsigned long flags) -+ bool *was_alldone, unsigned long flags, bool lazy) - { - return false; - } --- -2.40.1 - diff --git a/queue-6.1/series b/queue-6.1/series index 18429a3ceb9..02b77a9cf57 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -128,15 +128,6 @@ fprobe-pass-entry_data-to-handlers.patch fprobe-add-nr_maxactive-to-specify-rethook_node-pool.patch fprobe-fix-to-ensure-the-number-of-active-retprobes-.patch net-xfrm-skip-policies-marked-as-dead-while-reinsert.patch -rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch -rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch -rcu-make-call_rcu-lazy-to-save-power.patch -net-use-call_rcu_hurry-for-dst_release.patch -atomics-provide-atomic_add_negative-variants.patch -atomics-provide-rcuref-scalable-reference-counting.patch -net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch -net-dst-switch-to-rcuref_t-reference-counting.patch -net-dst-fix-missing-initialization-of-rt_uncached.patch xfrm6-fix-inet6_dev-refcount-underflow-problem.patch net-mlx5-e-switch-register-event-handler-before-armi.patch net-mlx5-handle-fw-tracer-change-ownership-event-bas.patch @@ -198,3 +189,5 @@ phy-mapphone-mdm6600-fix-runtime-disable-on-probe.patch phy-mapphone-mdm6600-fix-runtime-pm-for-remove.patch phy-mapphone-mdm6600-fix-pinctrl_pm-handling-for-sle.patch net-move-altnames-together-with-the-netdevice.patch +bluetooth-hci_sock-fix-slab-oob-read-in-create_monitor_event.patch +bluetooth-hci_sock-correctly-bounds-check-and-pad-hci_mon_new_index-name.patch diff --git 
a/queue-6.1/xfrm6-fix-inet6_dev-refcount-underflow-problem.patch b/queue-6.1/xfrm6-fix-inet6_dev-refcount-underflow-problem.patch index c0b5edf66cf..49297986bf6 100644 --- a/queue-6.1/xfrm6-fix-inet6_dev-refcount-underflow-problem.patch +++ b/queue-6.1/xfrm6-fix-inet6_dev-refcount-underflow-problem.patch @@ -33,27 +33,24 @@ Signed-off-by: Zhang Changzhong Reviewed-by: Xin Long Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman --- - net/ipv6/xfrm6_policy.c | 4 ++-- + net/ipv6/xfrm6_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c -index eecc5e59da17c..50c278f1c1063 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c -@@ -117,10 +117,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) +@@ -118,11 +118,11 @@ static void xfrm6_dst_destroy(struct dst { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - if (likely(xdst->u.rt6.rt6i_idev)) - in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); - rt6_uncached_list_del(&xdst->u.rt6); + if (xdst->u.rt6.rt6i_uncached_list) + rt6_uncached_list_del(&xdst->u.rt6); + if (likely(xdst->u.rt6.rt6i_idev)) + in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } --- -2.40.1 -
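As background for the call_rcu()/call_rcu_hurry() split described in the deleted RCU and dst patches above, a minimal illustrative sketch of how a caller chooses between the two variants on a kernel built with CONFIG_RCU_LAZY=y follows. This is not taken from any of the patches: struct foo, foo_free_rcu() and foo_release() are hypothetical names, while call_rcu(), call_rcu_hurry(), container_of() and kfree() are the real kernel interfaces the commit messages refer to.

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            struct rcu_head rcu;
            /* ... payload ... */
    };

    /* Runs after a grace period; frees the enclosing object. */
    static void foo_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct foo, rcu));
    }

    static void foo_release(struct foo *f, bool teardown)
    {
            if (teardown)
                    /*
                     * Latency-sensitive path (e.g. "ip netns del"): skip
                     * lazy batching so the callback is invoked after the
                     * next grace period rather than a deferred flush.
                     */
                    call_rcu_hurry(&f->rcu, foo_free_rcu);
            else
                    /*
                     * Default path: with CONFIG_RCU_LAZY=y the callback may
                     * be batched for several seconds to save power.
                     */
                    call_rcu(&f->rcu, foo_free_rcu);
    }

This mirrors the trade-off the deleted "net: Use call_rcu_hurry() for dst_release()" patch describes: teardown paths that wait on the freeing of dst entries use call_rcu_hurry() instead of the lazy default.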