+++ /dev/null
-From 2194a9643e933a16a92f83d3859f3916f95a5e42 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:30 +0100
-Subject: atomics: Provide atomic_add_negative() variants
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit e5ab9eff46b04c5a04778e40d7092fed3fda52ca ]
-
-atomic_add_negative() does not provide the relaxed/acquire/release
-variants.
-
-Provide them in preparation for a new scalable reference count algorithm.
-
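-As with the existing ordered fallbacks, each new variant is equivalent to
-the correspondingly ordered add_return() followed by a sign test. A
-userspace C11 approximation (illustrative only, not the kernel code):
-
-  #include <stdatomic.h>
-  #include <stdbool.h>
-
-  static bool add_negative_release(atomic_int *v, int i)
-  {
-          /* fetch_add returns the old value; old + i is the new value */
-          return atomic_fetch_add_explicit(v, i, memory_order_release) + i < 0;
-  }
-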
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Acked-by: Mark Rutland <mark.rutland@arm.com>
-Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/atomic/atomic-arch-fallback.h | 208 +++++++++++++++++++-
- include/linux/atomic/atomic-instrumented.h | 68 ++++++-
- include/linux/atomic/atomic-long.h | 38 +++-
- scripts/atomic/atomics.tbl | 2 +-
- scripts/atomic/fallbacks/add_negative | 11 +-
- 5 files changed, 309 insertions(+), 18 deletions(-)
-
-diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
-index 77bc5522e61c6..4226379a232d5 100644
---- a/include/linux/atomic/atomic-arch-fallback.h
-+++ b/include/linux/atomic/atomic-arch-fallback.h
-@@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v)
- #define arch_atomic_inc_and_test arch_atomic_inc_and_test
- #endif
-
-+#ifndef arch_atomic_add_negative_relaxed
-+#ifdef arch_atomic_add_negative
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative
-+#define arch_atomic_add_negative_release arch_atomic_add_negative
-+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative
-+#endif /* arch_atomic_add_negative */
-+
- #ifndef arch_atomic_add_negative
- /**
-- * arch_atomic_add_negative - add and test if negative
-+ * arch_atomic_add_negative - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
- arch_atomic_add_negative(int i, atomic_t *v)
-@@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v)
- #define arch_atomic_add_negative arch_atomic_add_negative
- #endif
-
-+#ifndef arch_atomic_add_negative_acquire
-+/**
-+ * arch_atomic_add_negative_acquire - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_acquire(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic_add_negative_release
-+/**
-+ * arch_atomic_add_negative_release - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_release(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic_add_negative_relaxed
-+/**
-+ * arch_atomic_add_negative_relaxed - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic_add_negative_relaxed(int i, atomic_t *v)
-+{
-+ return arch_atomic_add_return_relaxed(i, v) < 0;
-+}
-+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed
-+#endif
-+
-+#else /* arch_atomic_add_negative_relaxed */
-+
-+#ifndef arch_atomic_add_negative_acquire
-+static __always_inline bool
-+arch_atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ bool ret = arch_atomic_add_negative_relaxed(i, v);
-+ __atomic_acquire_fence();
-+ return ret;
-+}
-+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic_add_negative_release
-+static __always_inline bool
-+arch_atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ __atomic_release_fence();
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic_add_negative
-+static __always_inline bool
-+arch_atomic_add_negative(int i, atomic_t *v)
-+{
-+ bool ret;
-+ __atomic_pre_full_fence();
-+ ret = arch_atomic_add_negative_relaxed(i, v);
-+ __atomic_post_full_fence();
-+ return ret;
-+}
-+#define arch_atomic_add_negative arch_atomic_add_negative
-+#endif
-+
-+#endif /* arch_atomic_add_negative_relaxed */
-+
- #ifndef arch_atomic_fetch_add_unless
- /**
- * arch_atomic_fetch_add_unless - add unless the number is already a given value
-@@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v)
- #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
- #endif
-
-+#ifndef arch_atomic64_add_negative_relaxed
-+#ifdef arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative
-+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative
-+#endif /* arch_atomic64_add_negative */
-+
- #ifndef arch_atomic64_add_negative
- /**
-- * arch_atomic64_add_negative - add and test if negative
-+ * arch_atomic64_add_negative - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic64_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
- arch_atomic64_add_negative(s64 i, atomic64_t *v)
-@@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v)
- #define arch_atomic64_add_negative arch_atomic64_add_negative
- #endif
-
-+#ifndef arch_atomic64_add_negative_acquire
-+/**
-+ * arch_atomic64_add_negative_acquire - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_acquire(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_release
-+/**
-+ * arch_atomic64_add_negative_release - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_release(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_relaxed
-+/**
-+ * arch_atomic64_add_negative_relaxed - Add and test if negative
-+ * @i: integer value to add
-+ * @v: pointer of type atomic64_t
-+ *
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
-+ */
-+static __always_inline bool
-+arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
-+{
-+ return arch_atomic64_add_return_relaxed(i, v) < 0;
-+}
-+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed
-+#endif
-+
-+#else /* arch_atomic64_add_negative_relaxed */
-+
-+#ifndef arch_atomic64_add_negative_acquire
-+static __always_inline bool
-+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ bool ret = arch_atomic64_add_negative_relaxed(i, v);
-+ __atomic_acquire_fence();
-+ return ret;
-+}
-+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
-+#endif
-+
-+#ifndef arch_atomic64_add_negative_release
-+static __always_inline bool
-+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ __atomic_release_fence();
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
-+#endif
-+
-+#ifndef arch_atomic64_add_negative
-+static __always_inline bool
-+arch_atomic64_add_negative(s64 i, atomic64_t *v)
-+{
-+ bool ret;
-+ __atomic_pre_full_fence();
-+ ret = arch_atomic64_add_negative_relaxed(i, v);
-+ __atomic_post_full_fence();
-+ return ret;
-+}
-+#define arch_atomic64_add_negative arch_atomic64_add_negative
-+#endif
-+
-+#endif /* arch_atomic64_add_negative_relaxed */
-+
- #ifndef arch_atomic64_fetch_add_unless
- /**
- * arch_atomic64_fetch_add_unless - add unless the number is already a given value
-@@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v)
- #endif
-
- #endif /* _LINUX_ATOMIC_FALLBACK_H */
--// b5e87bdd5ede61470c29f7a7e4de781af3770f09
-+// 00071fffa021cec66f6290d706d69c91df87bade
-diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
-index 7a139ec030b0c..0496816738ca9 100644
---- a/include/linux/atomic/atomic-instrumented.h
-+++ b/include/linux/atomic/atomic-instrumented.h
-@@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v)
- return arch_atomic_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic_add_negative_acquire(int i, atomic_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_add_negative_release(int i, atomic_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_add_negative_relaxed(int i, atomic_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline int
- atomic_fetch_add_unless(atomic_t *v, int a, int u)
- {
-@@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v)
- return arch_atomic64_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic64_add_negative_acquire(s64 i, atomic64_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic64_add_negative_release(s64 i, atomic64_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline s64
- atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
- {
-@@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic_long_add_negative(i, v);
- }
-
-+static __always_inline bool
-+atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ kcsan_release();
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ instrument_atomic_read_write(v, sizeof(*v));
-+ return arch_atomic_long_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
- })
-
- #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
--// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
-+// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a
-diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
-index 800b8c35992d1..2fc51ba66bebd 100644
---- a/include/linux/atomic/atomic-long.h
-+++ b/include/linux/atomic/atomic-long.h
-@@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic64_add_negative(i, v);
- }
-
-+static __always_inline bool
-+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ return arch_atomic64_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
- return arch_atomic_add_negative(i, v);
- }
-
-+static __always_inline bool
-+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_acquire(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_release(i, v);
-+}
-+
-+static __always_inline bool
-+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
-+{
-+ return arch_atomic_add_negative_relaxed(i, v);
-+}
-+
- static __always_inline long
- arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
- {
-@@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v)
-
- #endif /* CONFIG_64BIT */
- #endif /* _LINUX_ATOMIC_LONG_H */
--// e8f0e08ff072b74d180eabe2ad001282b38c2c88
-+// a194c07d7d2f4b0e178d3c118c919775d5d65f50
-diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl
-index fbee2f6190d9e..85ca8d9b5c279 100644
---- a/scripts/atomic/atomics.tbl
-+++ b/scripts/atomic/atomics.tbl
-@@ -33,7 +33,7 @@ try_cmpxchg B v p:old i:new
- sub_and_test b i v
- dec_and_test b v
- inc_and_test b v
--add_negative b i v
-+add_negative B i v
- add_unless fb v i:a i:u
- inc_not_zero b v
- inc_unless_negative b v
-diff --git a/scripts/atomic/fallbacks/add_negative b/scripts/atomic/fallbacks/add_negative
-index 15caa2eb23712..e5980abf5904e 100755
---- a/scripts/atomic/fallbacks/add_negative
-+++ b/scripts/atomic/fallbacks/add_negative
-@@ -1,16 +1,15 @@
- cat <<EOF
- /**
-- * arch_${atomic}_add_negative - add and test if negative
-+ * arch_${atomic}_add_negative${order} - Add and test if negative
- * @i: integer value to add
- * @v: pointer of type ${atomic}_t
- *
-- * Atomically adds @i to @v and returns true
-- * if the result is negative, or false when
-- * result is greater than or equal to zero.
-+ * Atomically adds @i to @v and returns true if the result is negative,
-+ * or false when the result is greater than or equal to zero.
- */
- static __always_inline bool
--arch_${atomic}_add_negative(${int} i, ${atomic}_t *v)
-+arch_${atomic}_add_negative${order}(${int} i, ${atomic}_t *v)
- {
-- return arch_${atomic}_add_return(i, v) < 0;
-+ return arch_${atomic}_add_return${order}(i, v) < 0;
- }
- EOF
---
-2.40.1
-
+++ /dev/null
-From 47ac5394259bfc9dd07646a58feea1be4e624eef Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:31 +0100
-Subject: atomics: Provide rcuref - scalable reference counting
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit ee1ee6db07795d9637bc5e8993a8ddcf886541ef ]
-
-atomic_t based reference counting, including refcount_t, uses
-atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
-implemented with an atomic_try_cmpxchg() loop. High contention of the
-reference count leads to retry loops and scales badly. There is nothing to
-improve on this implementation as the semantics have to be preserved.
-
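-For reference, a userspace C11 sketch of that pattern (an approximation,
-not the kernel implementation) shows where the retries come from: every
-failed compare-exchange reloads the counter and restarts the loop.
-
-  #include <stdatomic.h>
-  #include <stdbool.h>
-
-  static bool inc_not_zero(atomic_uint *v)
-  {
-          unsigned int old = atomic_load_explicit(v, memory_order_relaxed);
-
-          do {
-                  if (old == 0)
-                          return false;
-                  /* on failure 'old' is refreshed and the loop retries */
-          } while (!atomic_compare_exchange_weak(v, &old, old + 1));
-
-          return true;
-  }
-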
-Provide rcuref as a scalable alternative solution which is suitable for RCU
-managed objects. Similar to refcount_t it comes with overflow and underflow
-detection and mitigation.
-
-rcuref treats the underlying atomic_t as an unsigned integer and partitions
-this space into zones:
-
- 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references)
- 0x80000000 - 0xBFFFFFFF saturation zone
- 0xC0000000 - 0xFFFFFFFE dead zone
- 0xFFFFFFFF no reference
-
-rcuref_get() unconditionally increments the reference count with
-atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
-reference count with atomic_add_negative_release().
-
-This unconditional increment avoids the inc_not_zero() problem, but
-requires a more complex implementation on the put() side when the count
-drops from 0 to -1.
-
-When this transition is detected, an attempt is made to mark the reference
-count dead by setting it to the midpoint of the dead zone with a single
-atomic_cmpxchg_release() operation. This operation can fail due to a
-concurrent rcuref_get() elevating the reference count from -1 to 0 again.
-
-If the unconditional increment in rcuref_get() hits a reference count which
-is marked dead (or saturated) it will detect it after the fact and bring
-back the reference count to the midpoint of the respective zone. The zones
-provide enough tolerance which makes it practically impossible to escape
-from a zone.
-
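-A minimal usage sketch for an RCU managed object (illustrative only; the
-struct, lookup() and the removal helper are hypothetical, while the rcuref
-API is the one introduced here):
-
-  struct obj {
-          rcuref_t ref;
-          struct rcu_head rcu;
-  };
-
-  /* acquire: the RCU read side keeps the object memory stable */
-  rcu_read_lock();
-  p = lookup(key);
-  if (p && !rcuref_get(&p->ref))
-          p = NULL;
-  rcu_read_unlock();
-
-  /* release: the last put allows scheduling the object for destruction */
-  if (rcuref_put(&p->ref)) {
-          remove_from_lookup_structure(p);
-          kfree_rcu(p, rcu);
-  }
-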
-The racy implementation of rcuref_put() requires protecting rcuref_put()
-against a grace period ending, in order to prevent a subtle use after
-free. As RCU is the only mechanism which can provide that protection, it
-is not possible to fully replace the atomic_inc_not_zero() based
-implementation of refcount_t with this scheme.
-
-The final drop is slightly more expensive than the atomic_dec_return()
-counterpart, but that's not the case this is optimized for. The
-optimization is for the high-frequency get()/put() pairs and their
-scalability.
-
-The performance of an uncontended rcuref_get()/put() pair where the put()
-is not dropping the last reference is still on par with the plain atomic
-operations, while at the same time providing overflow and underflow
-detection and mitigation.
-
-The performance of rcuref compared to plain atomic_inc_not_zero() and
-atomic_dec_return() based reference counting under contention:
-
- - Micro benchmark: All CPUs running an increment/decrement loop on an
- elevated reference count, which means the 0 to -1 transition never
- happens.
-
- The performance gain depends on microarchitecture and the number of
- CPUs and has been observed in the range of 1.3X to 4.7X
-
- - Conversion of dst_entry::__refcnt to rcuref and testing with the
- localhost memtier/memcached benchmark. That benchmark shows the
- reference count contention prominently.
-
- The performance gain depends on microarchitecture and the number of
- CPUs and has been observed in the range of 1.1X to 2.6X over the
- previous fix for the false sharing issue vs. struct
- dst_entry::__refcnt.
-
- When memtier is run over a real 1Gb network connection, there is a
- small gain on top of the false sharing fix. The two changes combined
- result in a 2%-5% total gain for that networked test.
-
-Reported-by: Wangyang Guo <wangyang.guo@intel.com>
-Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/rcuref.h | 155 +++++++++++++++++++++++
- include/linux/types.h | 6 +
- lib/Makefile | 2 +-
- lib/rcuref.c | 281 +++++++++++++++++++++++++++++++++++++++++
- 4 files changed, 443 insertions(+), 1 deletion(-)
- create mode 100644 include/linux/rcuref.h
- create mode 100644 lib/rcuref.c
-
-diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
-new file mode 100644
-index 0000000000000..2c8bfd0f1b6b3
---- /dev/null
-+++ b/include/linux/rcuref.h
-@@ -0,0 +1,155 @@
-+/* SPDX-License-Identifier: GPL-2.0-only */
-+#ifndef _LINUX_RCUREF_H
-+#define _LINUX_RCUREF_H
-+
-+#include <linux/atomic.h>
-+#include <linux/bug.h>
-+#include <linux/limits.h>
-+#include <linux/lockdep.h>
-+#include <linux/preempt.h>
-+#include <linux/rcupdate.h>
-+
-+#define RCUREF_ONEREF 0x00000000U
-+#define RCUREF_MAXREF 0x7FFFFFFFU
-+#define RCUREF_SATURATED 0xA0000000U
-+#define RCUREF_RELEASED 0xC0000000U
-+#define RCUREF_DEAD 0xE0000000U
-+#define RCUREF_NOREF 0xFFFFFFFFU
-+
-+/**
-+ * rcuref_init - Initialize a rcuref reference count with the given reference count
-+ * @ref: Pointer to the reference count
-+ * @cnt: The initial reference count typically '1'
-+ */
-+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
-+{
-+ atomic_set(&ref->refcnt, cnt - 1);
-+}
-+
-+/**
-+ * rcuref_read - Read the number of held reference counts of a rcuref
-+ * @ref: Pointer to the reference count
-+ *
-+ * Return: The number of held references (0 ... N)
-+ */
-+static inline unsigned int rcuref_read(rcuref_t *ref)
-+{
-+ unsigned int c = atomic_read(&ref->refcnt);
-+
-+ /* Return 0 if within the DEAD zone. */
-+ return c >= RCUREF_RELEASED ? 0 : c + 1;
-+}
-+
-+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
-+
-+/**
-+ * rcuref_get - Acquire one reference on a rcuref reference count
-+ * @ref: Pointer to the reference count
-+ *
-+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
-+ *
-+ * Provides no memory ordering, it is assumed the caller has guaranteed the
-+ * object memory to be stable (RCU, etc.). It does provide a control dependency
-+ * and thereby orders future stores. See documentation in lib/rcuref.c
-+ *
-+ * Return:
-+ * False if the attempt to acquire a reference failed. This happens
-+ * when the last reference has been put already
-+ *
-+ * True if a reference was successfully acquired
-+ */
-+static inline __must_check bool rcuref_get(rcuref_t *ref)
-+{
-+ /*
-+ * Unconditionally increase the reference count. The saturation and
-+ * dead zones provide enough tolerance for this.
-+ */
-+ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
-+ return true;
-+
-+ /* Handle the cases inside the saturation and dead zones */
-+ return rcuref_get_slowpath(ref);
-+}
-+
-+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
-+
-+/*
-+ * Internal helper. Do not invoke directly.
-+ */
-+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
-+{
-+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
-+ "suspicious rcuref_put_rcusafe() usage");
-+ /*
-+ * Unconditionally decrease the reference count. The saturation and
-+ * dead zones provide enough tolerance for this.
-+ */
-+ if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
-+ return false;
-+
-+ /*
-+ * Handle the last reference drop and cases inside the saturation
-+ * and dead zones.
-+ */
-+ return rcuref_put_slowpath(ref);
-+}
-+
-+/**
-+ * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
-+ * @ref: Pointer to the reference count
-+ *
-+ * Provides release memory ordering, such that prior loads and stores are done
-+ * before, and provides an acquire ordering on success such that free()
-+ * must come after.
-+ *
-+ * Can be invoked from contexts, which guarantee that no grace period can
-+ * happen which would free the object concurrently if the decrement drops
-+ * the last reference and the slowpath races against a concurrent get() and
-+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
-+ *
-+ * Return:
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely release the
-+ * object which is protected by the reference counter.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * release the protected object.
-+ */
-+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
-+{
-+ return __rcuref_put(ref);
-+}
-+
-+/**
-+ * rcuref_put -- Release one reference for a rcuref reference count
-+ * @ref: Pointer to the reference count
-+ *
-+ * Can be invoked from any context.
-+ *
-+ * Provides release memory ordering, such that prior loads and stores are done
-+ * before, and provides an acquire ordering on success such that free()
-+ * must come after.
-+ *
-+ * Return:
-+ *
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely schedule the
-+ * object, which is protected by the reference counter, for
-+ * deconstruction.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * deconstruct the protected object.
-+ */
-+static inline __must_check bool rcuref_put(rcuref_t *ref)
-+{
-+ bool released;
-+
-+ preempt_disable();
-+ released = __rcuref_put(ref);
-+ preempt_enable();
-+ return released;
-+}
-+
-+#endif
-diff --git a/include/linux/types.h b/include/linux/types.h
-index ea8cf60a8a795..688fb943556a1 100644
---- a/include/linux/types.h
-+++ b/include/linux/types.h
-@@ -175,6 +175,12 @@ typedef struct {
- } atomic64_t;
- #endif
-
-+typedef struct {
-+ atomic_t refcnt;
-+} rcuref_t;
-+
-+#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) }
-+
- struct list_head {
- struct list_head *next, *prev;
- };
-diff --git a/lib/Makefile b/lib/Makefile
-index 5ffe72ec99797..afd78c497ec76 100644
---- a/lib/Makefile
-+++ b/lib/Makefile
-@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
- list_sort.o uuid.o iov_iter.o clz_ctz.o \
- bsearch.o find_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o rhashtable.o base64.o \
-- once.o refcount.o usercopy.o errseq.o bucket_locks.o \
-+ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
- generic-radix-tree.o
- obj-$(CONFIG_STRING_SELFTEST) += test_string.o
- obj-y += string_helpers.o
-diff --git a/lib/rcuref.c b/lib/rcuref.c
-new file mode 100644
-index 0000000000000..5ec00a4a64d11
---- /dev/null
-+++ b/lib/rcuref.c
-@@ -0,0 +1,281 @@
-+// SPDX-License-Identifier: GPL-2.0-only
-+
-+/*
-+ * rcuref - A scalable reference count implementation for RCU managed objects
-+ *
-+ * rcuref is provided to replace open coded reference count implementations
-+ * based on atomic_t. It protects explicitly RCU managed objects which can
-+ * be visible even after the last reference has been dropped and the object
-+ * is heading towards destruction.
-+ *
-+ * A common usage pattern is:
-+ *
-+ * get()
-+ * rcu_read_lock();
-+ * p = get_ptr();
-+ * if (p && !atomic_inc_not_zero(&p->refcnt))
-+ * p = NULL;
-+ * rcu_read_unlock();
-+ * return p;
-+ *
-+ * put()
-+ *		if (!atomic_dec_return(&p->refcnt)) {
-+ * remove_ptr(p);
-+ *			kfree_rcu(p, rcu);
-+ * }
-+ *
-+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
-+ * O(N^2) behaviour under contention with N concurrent operations.
-+ *
-+ * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
-+ * better under contention.
-+ *
-+ * Why not refcount?
-+ * =================
-+ *
-+ * In principle it should be possible to make refcount use the rcuref
-+ * scheme, but the destruction race described below cannot be prevented
-+ * unless the protected object is RCU managed.
-+ *
-+ * Theory of operation
-+ * ===================
-+ *
-+ * rcuref uses an unsigned integer reference counter. As long as the
-+ * counter value is greater than or equal to RCUREF_ONEREF and not larger
-+ * than RCUREF_MAXREF the reference is alive:
-+ *
-+ * ONEREF MAXREF SATURATED RELEASED DEAD NOREF
-+ * 0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
-+ * <---valid --------> <-------saturation zone-------> <-----dead zone----->
-+ *
-+ * The get() and put() operations do unconditional increments and
-+ * decrements. The result is checked after the operation. This optimizes
-+ * for the fast path.
-+ *
-+ * If the reference count is saturated or dead, then the increments and
-+ * decrements are not harmful as the reference count still stays in the
-+ * respective zones and is always set back to SATURATED resp. DEAD. The
-+ * zones have room for 2^28 racing operations in each direction, which
-+ * makes it practically impossible to escape the zones.
-+ *
-+ * Once the last reference is dropped the reference count becomes
-+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
-+ * slowpath then tries to set the reference count from RCUREF_NOREF to
-+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
-+ * concurrent rcuref_get() can acquire the reference count and bring it
-+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
-+ *
-+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
-+ * DEAD + 1, which is inside the dead zone. If that happens the reference
-+ * count is put back to DEAD.
-+ *
-+ * The actual race is possible due to the unconditional increment and
-+ * decrements in rcuref_get() and rcuref_put():
-+ *
-+ * T1 T2
-+ * get() put()
-+ * if (atomic_add_negative(-1, &ref->refcnt))
-+ * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
-+ *
-+ * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1
-+ *
-+ * As the result of T1's add is negative, the get() goes into the slow path
-+ * and observes refcnt being in the dead zone which makes the operation fail.
-+ *
-+ * Possible critical states:
-+ *
-+ * Context Counter References Operation
-+ * T1 0 1 init()
-+ * T2 1 2 get()
-+ * T1 0 1 put()
-+ * T2 -1 0 put() tries to mark dead
-+ * T1 0 1 get()
-+ * T2 0 1 put() mark dead fails
-+ * T1 -1 0 put() tries to mark dead
-+ * T1 DEAD 0 put() mark dead succeeds
-+ * T2 DEAD+1 0 get() fails and puts it back to DEAD
-+ *
-+ * Of course there are more complex scenarios, but the above illustrates
-+ * the working principle. The rest is left to the imagination of the
-+ * reader.
-+ *
-+ * Deconstruction race
-+ * ===================
-+ *
-+ * The release operation must be protected by prohibiting a grace period in
-+ * order to prevent a possible use after free:
-+ *
-+ * T1 T2
-+ * put() get()
-+ * // ref->refcnt = ONEREF
-+ * if (!atomic_add_negative(-1, &ref->refcnt))
-+ * return false; <- Not taken
-+ *
-+ * // ref->refcnt == NOREF
-+ * --> preemption
-+ * // Elevates ref->refcnt to ONEREF
-+ * if (!atomic_add_negative(1, &ref->refcnt))
-+ * return true; <- taken
-+ *
-+ * if (put(&p->ref)) { <-- Succeeds
-+ * remove_pointer(p);
-+ * kfree_rcu(p, rcu);
-+ * }
-+ *
-+ * RCU grace period ends, object is freed
-+ *
-+ * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF
-+ *
-+ * This is prevented by disabling preemption around the put() operation as
-+ * that's in most kernel configurations cheaper than a rcu_read_lock() /
-+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
-+ * prevents the grace period which keeps the object alive until all put()
-+ * operations complete.
-+ *
-+ * Saturation protection
-+ * =====================
-+ *
-+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
-+ * Once this is exceeded the reference count becomes stale by setting it
-+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
-+ * wraparounds which obviously cause worse problems than a memory
-+ * leak. When saturation is reached a warning is emitted.
-+ *
-+ * Race conditions
-+ * ===============
-+ *
-+ * All reference count increment/decrement operations are unconditional and
-+ * only verified after the fact. This optimizes for the good case and takes
-+ * the occasional race vs. a dead or already saturated refcount into
-+ * account. The saturation and dead zones are large enough to accomodate
-+ * for that.
-+ *
-+ * Memory ordering
-+ * ===============
-+ *
-+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
-+ * and provide only what is strictly required for refcounts.
-+ *
-+ * The increments are fully relaxed; these will not provide ordering. The
-+ * rationale is that whatever is used to obtain the object to increase the
-+ * reference count on will provide the ordering. For locked data
-+ * structures, it's the lock acquire; for RCU/lockless data structures it's
-+ * the dependent load.
-+ *
-+ * rcuref_get() provides a control dependency ordering future stores which
-+ * ensures that the object is not modified when acquiring a reference
-+ * fails.
-+ *
-+ * rcuref_put() provides release order, i.e. all prior loads and stores
-+ * will be issued before. It also provides a control dependency ordering
-+ * against the subsequent destruction of the object.
-+ *
-+ * If rcuref_put() successfully dropped the last reference and marked the
-+ * object DEAD it also provides acquire ordering.
-+ */
-+
-+#include <linux/export.h>
-+#include <linux/rcuref.h>
-+
-+/**
-+ * rcuref_get_slowpath - Slowpath of rcuref_get()
-+ * @ref: Pointer to the reference count
-+ *
-+ * Invoked when the reference count is outside of the valid zone.
-+ *
-+ * Return:
-+ * False if the reference count was already marked dead
-+ *
-+ * True if the reference count is saturated, which prevents the
-+ * object from being deconstructed ever.
-+ */
-+bool rcuref_get_slowpath(rcuref_t *ref)
-+{
-+ unsigned int cnt = atomic_read(&ref->refcnt);
-+
-+ /*
-+ * If the reference count was already marked dead, undo the
-+ * increment so it stays in the middle of the dead zone and return
-+ * fail.
-+ */
-+ if (cnt >= RCUREF_RELEASED) {
-+ atomic_set(&ref->refcnt, RCUREF_DEAD);
-+ return false;
-+ }
-+
-+ /*
-+ * If it was saturated, warn and mark it so. In case the increment
-+ * was already on a saturated value restore the saturation
-+ * marker. This keeps it in the middle of the saturation zone and
-+ * prevents the reference count from overflowing. This leaks the
-+ * object memory, but prevents the obvious reference count overflow
-+ * damage.
-+ */
-+ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
-+ atomic_set(&ref->refcnt, RCUREF_SATURATED);
-+ return true;
-+}
-+EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
-+
-+/**
-+ * rcuref_put_slowpath - Slowpath of __rcuref_put()
-+ * @ref: Pointer to the reference count
-+ *
-+ * Invoked when the reference count is outside of the valid zone.
-+ *
-+ * Return:
-+ * True if this was the last reference with no future references
-+ * possible. This signals the caller that it can safely schedule the
-+ * object, which is protected by the reference counter, for
-+ * deconstruction.
-+ *
-+ * False if there are still active references or the put() raced
-+ * with a concurrent get()/put() pair. Caller is not allowed to
-+ * deconstruct the protected object.
-+ */
-+bool rcuref_put_slowpath(rcuref_t *ref)
-+{
-+ unsigned int cnt = atomic_read(&ref->refcnt);
-+
-+ /* Did this drop the last reference? */
-+ if (likely(cnt == RCUREF_NOREF)) {
-+ /*
-+ * Carefully try to set the reference count to RCUREF_DEAD.
-+ *
-+ * This can fail if a concurrent get() operation has
-+ * elevated it again or the corresponding put() even marked
-+ * it dead already. Both are valid situations and do not
-+ * require a retry. If this fails the caller is not
-+ * allowed to deconstruct the object.
-+ */
-+ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
-+ return false;
-+
-+ /*
-+ * The caller can safely schedule the object for
-+ * deconstruction. Provide acquire ordering.
-+ */
-+ smp_acquire__after_ctrl_dep();
-+ return true;
-+ }
-+
-+ /*
-+ * If the reference count was already in the dead zone, then this
-+ * put() operation is imbalanced. Warn, put the reference count back to
-+ * DEAD and tell the caller to not deconstruct the object.
-+ */
-+ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
-+ atomic_set(&ref->refcnt, RCUREF_DEAD);
-+ return false;
-+ }
-+
-+ /*
-+ * This is a put() operation on a saturated refcount. Restore the
-+ * mean saturation value and tell the caller to not deconstruct the
-+ * object.
-+ */
-+ if (cnt > RCUREF_MAXREF)
-+ atomic_set(&ref->refcnt, RCUREF_SATURATED);
-+ return false;
-+}
-+EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
---
-2.40.1
-
+++ /dev/null
-From f4fdfd10202488104e6e484bd76fd1b5cd7c10c6 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 20 Apr 2023 20:25:08 +0200
-Subject: net: dst: fix missing initialization of rt_uncached
-
-From: Maxime Bizon <mbizon@freebox.fr>
-
-[ Upstream commit 418a73074da9182f571e467eaded03ea501f3281 ]
-
-xfrm_alloc_dst() followed by xfrm4_dst_destroy(), without a
-xfrm4_fill_dst() call in between, causes the following BUG:
-
- BUG: spinlock bad magic on CPU#0, fbxhostapd/732
- lock: 0x890b7668, .magic: 890b7668, .owner: <none>/-1, .owner_cpu: 0
- CPU: 0 PID: 732 Comm: fbxhostapd Not tainted 6.3.0-rc6-next-20230414-00613-ge8de66369925-dirty #9
- Hardware name: Marvell Kirkwood (Flattened Device Tree)
- unwind_backtrace from show_stack+0x10/0x14
- show_stack from dump_stack_lvl+0x28/0x30
- dump_stack_lvl from do_raw_spin_lock+0x20/0x80
- do_raw_spin_lock from rt_del_uncached_list+0x30/0x64
- rt_del_uncached_list from xfrm4_dst_destroy+0x3c/0xbc
- xfrm4_dst_destroy from dst_destroy+0x5c/0xb0
- dst_destroy from rcu_process_callbacks+0xc4/0xec
- rcu_process_callbacks from __do_softirq+0xb4/0x22c
- __do_softirq from call_with_stack+0x1c/0x24
- call_with_stack from do_softirq+0x60/0x6c
- do_softirq from __local_bh_enable_ip+0xa0/0xcc
-
-Patch "net: dst: Prevent false sharing vs. dst_entry:: __refcnt" moved
-rt_uncached and rt_uncached_list fields from rtable struct to dst
-struct, so they are no longer zeroed by memset_after(xdst, 0, u.dst) in
-xfrm_alloc_dst().
-
-Note that rt_uncached (list_head) was never properly initialized at
-alloc time, but xfrm[46]_dst_destroy() is written in such a way that
-it was not an issue thanks to the memset:
-
- if (xdst->u.rt.dst.rt_uncached_list)
- rt_del_uncached_list(&xdst->u.rt);
-
-The route code does it the other way around: rt_uncached_list is
-assumed to be valid iff the rt_uncached list_head is not empty:
-
-void rt_del_uncached_list(struct rtable *rt)
-{
- if (!list_empty(&rt->dst.rt_uncached)) {
- struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
- list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
-}
-
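-The invariant relied on there is that an initialized but unlinked
-list_head points to itself, so list_empty() only answers "is this entry
-on a list?" reliably after INIT_LIST_HEAD(). A simplified userspace
-illustration (not the kernel implementation):
-
-  #include <stdbool.h>
-
-  struct list_head { struct list_head *next, *prev; };
-
-  static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
-  static bool list_empty(const struct list_head *h) { return h->next == h; }
-
-  /* Without INIT_LIST_HEAD(), next/prev hold whatever the allocator left
-   * behind, list_empty() may report "on a list", and the del path then
-   * takes a lock through a garbage rt_uncached_list pointer - the BUG
-   * above. */
-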
-This patch adds mandatory rt_uncached list_head initialization in
-generic dst_init(), and adapts the xfrm[46]_dst_destroy() logic to match the
-rest of the code.
-
-Fixes: d288a162dd1c ("net: dst: Prevent false sharing vs. dst_entry:: __refcnt")
-Reported-by: kernel test robot <oliver.sang@intel.com>
-Link: https://lore.kernel.org/oe-lkp/202304162125.18b7bcdd-oliver.sang@intel.com
-Reviewed-by: David Ahern <dsahern@kernel.org>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-CC: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Maxime Bizon <mbizon@freebox.fr>
-Link: https://lore.kernel.org/r/20230420182508.2417582-1-mbizon@freebox.fr
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- net/core/dst.c | 1 +
- net/ipv4/route.c | 4 ----
- net/ipv4/xfrm4_policy.c | 4 +---
- net/ipv6/route.c | 1 -
- net/ipv6/xfrm6_policy.c | 4 +---
- 5 files changed, 3 insertions(+), 11 deletions(-)
-
-diff --git a/net/core/dst.c b/net/core/dst.c
-index 2b7b1619b5e29..1666a6f5e858e 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -67,6 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- #endif
- dst->lwtstate = NULL;
- rcuref_init(&dst->__rcuref, initial_ref);
-+ INIT_LIST_HEAD(&dst->rt_uncached);
- dst->__use = 0;
- dst->lastuse = jiffies;
- dst->flags = flags;
-diff --git a/net/ipv4/route.c b/net/ipv4/route.c
-index 7ccf6503d67aa..a44d20644fbc2 100644
---- a/net/ipv4/route.c
-+++ b/net/ipv4/route.c
-@@ -1646,7 +1646,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
- rt->rt_uses_gateway = 0;
- rt->rt_gw_family = 0;
- rt->rt_gw4 = 0;
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
-
- rt->dst.output = ip_output;
- if (flags & RTCF_LOCAL)
-@@ -1677,7 +1676,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
- new_rt->rt_gw4 = rt->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- new_rt->rt_gw6 = rt->rt_gw6;
-- INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
-
- new_rt->dst.input = rt->dst.input;
- new_rt->dst.output = rt->dst.output;
-@@ -2861,8 +2859,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
- rt->rt_gw4 = ort->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- rt->rt_gw6 = ort->rt_gw6;
--
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- dst_release(dst_orig);
-diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
-index 47861c8b7340e..9403bbaf1b616 100644
---- a/net/ipv4/xfrm4_policy.c
-+++ b/net/ipv4/xfrm4_policy.c
-@@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt.rt_gw6 = rt->rt_gw6;
- xdst->u.rt.rt_pmtu = rt->rt_pmtu;
- xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-- INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
- rt_add_uncached_list(&xdst->u.rt);
-
- return 0;
-@@ -121,8 +120,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt.dst.rt_uncached_list)
-- rt_del_uncached_list(&xdst->u.rt);
-+ rt_del_uncached_list(&xdst->u.rt);
- xfrm_dst_destroy(xdst);
- }
-
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 9db0b2318e918..d4d06a9d985e8 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -334,7 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
- static void rt6_info_init(struct rt6_info *rt)
- {
- memset_after(rt, 0, dst);
-- INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- /* allocate dst with ip6_dst_ops */
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index 2b493f8d00918..eecc5e59da17c 100644
---- a/net/ipv6/xfrm6_policy.c
-+++ b/net/ipv6/xfrm6_policy.c
-@@ -89,7 +89,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
- xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
- xdst->u.rt6.rt6i_src = rt->rt6i_src;
-- INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
- rt6_uncached_list_add(&xdst->u.rt6);
-
- return 0;
-@@ -121,8 +120,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt6.dst.rt_uncached_list)
-- rt6_uncached_list_del(&xdst->u.rt6);
-+ rt6_uncached_list_del(&xdst->u.rt6);
- xfrm_dst_destroy(xdst);
- }
-
---
-2.40.1
-
+++ /dev/null
-From e7f0083dd5326ec3a897b9d9c144fdaf4f630c4a Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:29 +0100
-Subject: net: dst: Prevent false sharing vs. dst_entry:: __refcnt
-
-From: Wangyang Guo <wangyang.guo@intel.com>
-
-[ Upstream commit d288a162dd1c73507da582966f17dd226e34a0c0 ]
-
-dst_entry::__refcnt is highly contended in scenarios where many connections
-happen from and to the same IP. The reference count is an atomic_t, so the
-reference count operations have to take the cache-line exclusive.
-
-Aside of the unavoidable reference count contention there is another
-significant problem which is caused by that: False sharing.
-
-perf top identified two affected read accesses. dst_entry::lwtstate and
-rtable::rt_genid.
-
-dst_entry::__refcnt is located at offset 64 of dst_entry, which puts it into
-a separate cache line vs. the read-mostly members located at the beginning
-of the struct.
-
-That prevents false sharing vs. the struct members in the first 64
-bytes of the structure, but there is also
-
- dst_entry::lwtstate
-
-which is located after the reference count and in the same cache line. This
-member is read after a reference count has been acquired.
-
-struct rtable embeds a struct dst_entry at offset 0. struct dst_entry has a
-size of 112 bytes, which means that the struct members of rtable which
-follow the dst member share the same cache line as dst_entry::__refcnt.
-Especially
-
- rtable::rt_genid
-
-is also read by the contexts which have a reference count acquired
-already.
-
-When dst_entry::__refcnt is incremented or decremented via an atomic
-operation, these read accesses stall. This was found when analysing the
-memtier benchmark in 1:100 mode, which amplifies the problem extremely.
-
-Move the rt[6i]_uncached[_list] members out of struct rtable and struct
-rt6_info into struct dst_entry to provide padding and move the lwtstate
-member after that so it ends up in the same cache line.
-
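-The layout concern can also be checked at build time. A hedged sketch on a
-made-up struct (the patch itself separates the members purely by placement
-and padding, not by explicit alignment attributes):
-
-  #include <stdalign.h>
-  #include <stdatomic.h>
-  #include <stddef.h>
-
-  struct example_entry {
-          /* read-mostly members, packed together */
-          void *dev;
-          void *ops;
-          unsigned int flags;
-          /* contended counter starts on its own cache line */
-          alignas(64) atomic_int refcnt;
-  };
-
-  _Static_assert(offsetof(struct example_entry, refcnt) % 64 == 0,
-                 "counter starts on a cache-line boundary");
-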
-The resulting improvement depends on the micro-architecture and the number
-of CPUs. It ranges from +20% to +120% with a localhost memtier/memcached
-benchmark.
-
-[ tglx: Rearrange struct ]
-
-Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
-Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-Reviewed-by: David Ahern <dsahern@kernel.org>
-Link: https://lore.kernel.org/r/20230323102800.042297517@linutronix.de
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/net/dst.h | 15 ++++++++++++++-
- include/net/ip6_fib.h | 3 ---
- include/net/ip6_route.h | 2 +-
- include/net/route.h | 3 ---
- net/ipv4/route.c | 20 ++++++++++----------
- net/ipv4/xfrm4_policy.c | 4 ++--
- net/ipv6/route.c | 26 +++++++++++++-------------
- net/ipv6/xfrm6_policy.c | 4 ++--
- 8 files changed, 42 insertions(+), 35 deletions(-)
-
-diff --git a/include/net/dst.h b/include/net/dst.h
-index d67fda89cd0fa..81f2279ea911a 100644
---- a/include/net/dst.h
-+++ b/include/net/dst.h
-@@ -69,15 +69,28 @@ struct dst_entry {
- #endif
- int __use;
- unsigned long lastuse;
-- struct lwtunnel_state *lwtstate;
- struct rcu_head rcu_head;
- short error;
- short __pad;
- __u32 tclassid;
- #ifndef CONFIG_64BIT
-+ struct lwtunnel_state *lwtstate;
- atomic_t __refcnt; /* 32-bit offset 64 */
- #endif
- netdevice_tracker dev_tracker;
-+
-+ /*
-+ * Used by rtable and rt6_info. Moves lwtstate into the next cache
-+ * line on 64bit so that lwtstate does not cause false sharing with
-+ * __refcnt under contention of __refcnt. This also puts the
-+ * frequently accessed members of rtable and rt6_info out of the
-+ * __refcnt cache line.
-+ */
-+ struct list_head rt_uncached;
-+ struct uncached_list *rt_uncached_list;
-+#ifdef CONFIG_64BIT
-+ struct lwtunnel_state *lwtstate;
-+#endif
- };
-
- struct dst_metrics {
-diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
-index fa4e6af382e2a..9ba6413fd2e3e 100644
---- a/include/net/ip6_fib.h
-+++ b/include/net/ip6_fib.h
-@@ -217,9 +217,6 @@ struct rt6_info {
- struct inet6_dev *rt6i_idev;
- u32 rt6i_flags;
-
-- struct list_head rt6i_uncached;
-- struct uncached_list *rt6i_uncached_list;
--
- /* more non-fragment space at head required */
- unsigned short rt6i_nfheader_len;
- };
-diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
-index 035d61d50a989..6c6b673d92554 100644
---- a/include/net/ip6_route.h
-+++ b/include/net/ip6_route.h
-@@ -104,7 +104,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
- static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
- {
- if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
-- !list_empty(&rt->rt6i_uncached))
-+ !list_empty(&rt->dst.rt_uncached))
- ip6_rt_put(rt);
- }
-
-diff --git a/include/net/route.h b/include/net/route.h
-index af8431b25f800..9ca0f72868b76 100644
---- a/include/net/route.h
-+++ b/include/net/route.h
-@@ -78,9 +78,6 @@ struct rtable {
- /* Miscellaneous cached information */
- u32 rt_mtu_locked:1,
- rt_pmtu:31;
--
-- struct list_head rt_uncached;
-- struct uncached_list *rt_uncached_list;
- };
-
- static inline bool rt_is_input_route(const struct rtable *rt)
-diff --git a/net/ipv4/route.c b/net/ipv4/route.c
-index 9cbaae4f5ee71..7ccf6503d67aa 100644
---- a/net/ipv4/route.c
-+++ b/net/ipv4/route.c
-@@ -1510,20 +1510,20 @@ void rt_add_uncached_list(struct rtable *rt)
- {
- struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
-
-- rt->rt_uncached_list = ul;
-+ rt->dst.rt_uncached_list = ul;
-
- spin_lock_bh(&ul->lock);
-- list_add_tail(&rt->rt_uncached, &ul->head);
-+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
- spin_unlock_bh(&ul->lock);
- }
-
- void rt_del_uncached_list(struct rtable *rt)
- {
-- if (!list_empty(&rt->rt_uncached)) {
-- struct uncached_list *ul = rt->rt_uncached_list;
-+ if (!list_empty(&rt->dst.rt_uncached)) {
-+ struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
-- list_del_init(&rt->rt_uncached);
-+ list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
- }
-@@ -1548,13 +1548,13 @@ void rt_flush_dev(struct net_device *dev)
- continue;
-
- spin_lock_bh(&ul->lock);
-- list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
-+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
- if (rt->dst.dev != dev)
- continue;
- rt->dst.dev = blackhole_netdev;
- netdev_ref_replace(dev, blackhole_netdev,
- &rt->dst.dev_tracker, GFP_ATOMIC);
-- list_move(&rt->rt_uncached, &ul->quarantine);
-+ list_move(&rt->dst.rt_uncached, &ul->quarantine);
- }
- spin_unlock_bh(&ul->lock);
- }
-@@ -1646,7 +1646,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
- rt->rt_uses_gateway = 0;
- rt->rt_gw_family = 0;
- rt->rt_gw4 = 0;
-- INIT_LIST_HEAD(&rt->rt_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
-
- rt->dst.output = ip_output;
- if (flags & RTCF_LOCAL)
-@@ -1677,7 +1677,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
- new_rt->rt_gw4 = rt->rt_gw4;
- else if (rt->rt_gw_family == AF_INET6)
- new_rt->rt_gw6 = rt->rt_gw6;
-- INIT_LIST_HEAD(&new_rt->rt_uncached);
-+ INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
-
- new_rt->dst.input = rt->dst.input;
- new_rt->dst.output = rt->dst.output;
-@@ -2862,7 +2862,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
- else if (rt->rt_gw_family == AF_INET6)
- rt->rt_gw6 = ort->rt_gw6;
-
-- INIT_LIST_HEAD(&rt->rt_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- dst_release(dst_orig);
-diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
-index 3d0dfa6cf9f96..47861c8b7340e 100644
---- a/net/ipv4/xfrm4_policy.c
-+++ b/net/ipv4/xfrm4_policy.c
-@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt.rt_gw6 = rt->rt_gw6;
- xdst->u.rt.rt_pmtu = rt->rt_pmtu;
- xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-- INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
-+ INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
- rt_add_uncached_list(&xdst->u.rt);
-
- return 0;
-@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt.rt_uncached_list)
-+ if (xdst->u.rt.dst.rt_uncached_list)
- rt_del_uncached_list(&xdst->u.rt);
- xfrm_dst_destroy(xdst);
- }
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 0bcdb675ba2c1..7205adee46c21 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt)
- {
- struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
-
-- rt->rt6i_uncached_list = ul;
-+ rt->dst.rt_uncached_list = ul;
-
- spin_lock_bh(&ul->lock);
-- list_add_tail(&rt->rt6i_uncached, &ul->head);
-+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
- spin_unlock_bh(&ul->lock);
- }
-
- void rt6_uncached_list_del(struct rt6_info *rt)
- {
-- if (!list_empty(&rt->rt6i_uncached)) {
-- struct uncached_list *ul = rt->rt6i_uncached_list;
-+ if (!list_empty(&rt->dst.rt_uncached)) {
-+ struct uncached_list *ul = rt->dst.rt_uncached_list;
-
- spin_lock_bh(&ul->lock);
-- list_del_init(&rt->rt6i_uncached);
-+ list_del_init(&rt->dst.rt_uncached);
- spin_unlock_bh(&ul->lock);
- }
- }
-@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
- continue;
-
- spin_lock_bh(&ul->lock);
-- list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
-+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
- struct inet6_dev *rt_idev = rt->rt6i_idev;
- struct net_device *rt_dev = rt->dst.dev;
- bool handled = false;
-@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
- handled = true;
- }
- if (handled)
-- list_move(&rt->rt6i_uncached,
-+ list_move(&rt->dst.rt_uncached,
- &ul->quarantine);
- }
- spin_unlock_bh(&ul->lock);
-@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
- static void rt6_info_init(struct rt6_info *rt)
- {
- memset_after(rt, 0, dst);
-- INIT_LIST_HEAD(&rt->rt6i_uncached);
-+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
- }
-
- /* allocate dst with ip6_dst_ops */
-@@ -2641,7 +2641,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
- dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
- rt6 = (struct rt6_info *)dst;
- /* For dst cached in uncached_list, refcnt is already taken. */
-- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
-+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
- dst = &net->ipv6.ip6_null_entry->dst;
- dst_hold(dst);
- }
-@@ -2751,7 +2751,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
- from = rcu_dereference(rt->from);
-
- if (from && (rt->rt6i_flags & RTF_PCPU ||
-- unlikely(!list_empty(&rt->rt6i_uncached))))
-+ unlikely(!list_empty(&rt->dst.rt_uncached))))
- dst_ret = rt6_dst_from_check(rt, from, cookie);
- else
- dst_ret = rt6_check(rt, from, cookie);
-@@ -6488,7 +6488,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
-
- #ifdef CONFIG_IPV6_MULTIPLE_TABLES
- net->ipv6.fib6_has_custom_rules = false;
-@@ -6500,7 +6500,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
-
- net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
- sizeof(*net->ipv6.ip6_blk_hole_entry),
-@@ -6510,7 +6510,7 @@ static int __net_init ip6_route_net_init(struct net *net)
- net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
- ip6_template_metrics, true);
-- INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
-+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
- #ifdef CONFIG_IPV6_SUBTREES
- net->ipv6.fib6_routes_require_src = 0;
- #endif
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index ea435eba30534..2b493f8d00918 100644
---- a/net/ipv6/xfrm6_policy.c
-+++ b/net/ipv6/xfrm6_policy.c
-@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
- xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
- xdst->u.rt6.rt6i_src = rt->rt6i_src;
-- INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
-+ INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
- rt6_uncached_list_add(&xdst->u.rt6);
-
- return 0;
-@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
- dst_destroy_metrics_generic(dst);
-- if (xdst->u.rt6.rt6i_uncached_list)
-+ if (xdst->u.rt6.dst.rt_uncached_list)
- rt6_uncached_list_del(&xdst->u.rt6);
- xfrm_dst_destroy(xdst);
- }
---
-2.40.1
-
+++ /dev/null
-From 180ab46081f3404a77e4cef550c4f0b28701a1b3 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Thu, 23 Mar 2023 21:55:32 +0100
-Subject: net: dst: Switch to rcuref_t reference counting
-
-From: Thomas Gleixner <tglx@linutronix.de>
-
-[ Upstream commit bc9d3a9f2afca189a6ae40225b6985e3c775375e ]
-
-Under high contention dst_entry::__refcnt becomes a significant bottleneck.
-
-atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
-high retry rates on contention.
-
-Switch the reference count to rcuref_t which results in a significant
-performance gain. Rename the reference count member to __rcuref to reflect
-the change.
-
-The gain depends on the micro-architecture and the number of concurrent
-operations and has been measured in the range of +25% to +130% with a
-localhost memtier/memcached benchmark which amplifies the problem
-massively.
-
-Running the memtier/memcached benchmark over a real (1Gb) network
-connection the conversion on top of the false sharing fix for struct
-dst_entry::__refcnt results in a total gain in the 2%-5% range over the
-upstream baseline.
-
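For context, a minimal userspace C11 sketch (not the kernel's implementation;
the function name is made up) of the "increment unless zero" retry loop that
atomic_inc_not_zero() boils down to. Every failed compare-exchange forces a
re-read and another attempt, which is the cost under contention that the
rcuref_t fast path avoids by using an unconditional atomic add in the common
case, with a slow path only near saturation:

	#include <stdatomic.h>
	#include <stdbool.h>

	static bool inc_not_zero(atomic_int *refcnt)
	{
		int old = atomic_load(refcnt);

		do {
			/* 0 means the object is already being torn down. */
			if (old == 0)
				return false;
		} while (!atomic_compare_exchange_weak(refcnt, &old, old + 1));

		return true;
	}
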
-Reported-by: Wangyang Guo <wangyang.guo@intel.com>
-Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
-Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/net/dst.h | 19 ++++++++++---------
- include/net/sock.h | 2 +-
- net/bridge/br_nf_core.c | 2 +-
- net/core/dst.c | 26 +++++---------------------
- net/core/rtnetlink.c | 2 +-
- net/ipv6/route.c | 6 +++---
- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
- 7 files changed, 23 insertions(+), 38 deletions(-)
-
-diff --git a/include/net/dst.h b/include/net/dst.h
-index 81f2279ea911a..78884429deed8 100644
---- a/include/net/dst.h
-+++ b/include/net/dst.h
-@@ -16,6 +16,7 @@
- #include <linux/bug.h>
- #include <linux/jiffies.h>
- #include <linux/refcount.h>
-+#include <linux/rcuref.h>
- #include <net/neighbour.h>
- #include <asm/processor.h>
- #include <linux/indirect_call_wrapper.h>
-@@ -61,11 +62,11 @@ struct dst_entry {
- unsigned short trailer_len; /* space to reserve at tail */
-
- /*
-- * __refcnt wants to be on a different cache line from
-+ * __rcuref wants to be on a different cache line from
- * input/output/ops or performance tanks badly
- */
- #ifdef CONFIG_64BIT
-- atomic_t __refcnt; /* 64-bit offset 64 */
-+ rcuref_t __rcuref; /* 64-bit offset 64 */
- #endif
- int __use;
- unsigned long lastuse;
-@@ -75,16 +76,16 @@ struct dst_entry {
- __u32 tclassid;
- #ifndef CONFIG_64BIT
- struct lwtunnel_state *lwtstate;
-- atomic_t __refcnt; /* 32-bit offset 64 */
-+ rcuref_t __rcuref; /* 32-bit offset 64 */
- #endif
- netdevice_tracker dev_tracker;
-
- /*
- * Used by rtable and rt6_info. Moves lwtstate into the next cache
- * line on 64bit so that lwtstate does not cause false sharing with
-- * __refcnt under contention of __refcnt. This also puts the
-+ * __rcuref under contention of __rcuref. This also puts the
- * frequently accessed members of rtable and rt6_info out of the
-- * __refcnt cache line.
-+ * __rcuref cache line.
- */
- struct list_head rt_uncached;
- struct uncached_list *rt_uncached_list;
-@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst)
- {
- /*
- * If your kernel compilation stops here, please check
-- * the placement of __refcnt in struct dst_entry
-+ * the placement of __rcuref in struct dst_entry
- */
-- BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
-- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
-+ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
-+ WARN_ON(!rcuref_get(&dst->__rcuref));
- }
-
- static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
-@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
- */
- static inline bool dst_hold_safe(struct dst_entry *dst)
- {
-- return atomic_inc_not_zero(&dst->__refcnt);
-+ return rcuref_get(&dst->__rcuref);
- }
-
- /**
-diff --git a/include/net/sock.h b/include/net/sock.h
-index fe695e8bfe289..4c988b981d6e1 100644
---- a/include/net/sock.h
-+++ b/include/net/sock.h
-@@ -2181,7 +2181,7 @@ sk_dst_get(struct sock *sk)
-
- rcu_read_lock();
- dst = rcu_dereference(sk->sk_dst_cache);
-- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-+ if (dst && !rcuref_get(&dst->__rcuref))
- dst = NULL;
- rcu_read_unlock();
- return dst;
-diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
-index 8c69f0c95a8ed..98aea5485aaef 100644
---- a/net/bridge/br_nf_core.c
-+++ b/net/bridge/br_nf_core.c
-@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
- {
- struct rtable *rt = &br->fake_rtable;
-
-- atomic_set(&rt->dst.__refcnt, 1);
-+ rcuref_init(&rt->dst.__rcuref, 1);
- rt->dst.dev = br->dev;
- dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
- rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
-diff --git a/net/core/dst.c b/net/core/dst.c
-index a4e738d321ba2..2b7b1619b5e29 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- dst->tclassid = 0;
- #endif
- dst->lwtstate = NULL;
-- atomic_set(&dst->__refcnt, initial_ref);
-+ rcuref_init(&dst->__rcuref, initial_ref);
- dst->__use = 0;
- dst->lastuse = jiffies;
- dst->flags = flags;
-@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put);
-
- void dst_release(struct dst_entry *dst)
- {
-- if (dst) {
-- int newrefcnt;
--
-- newrefcnt = atomic_dec_return(&dst->__refcnt);
-- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
-- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-- __func__, dst, newrefcnt);
-- if (!newrefcnt)
-- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
-- }
-+ if (dst && rcuref_put(&dst->__rcuref))
-+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
- }
- EXPORT_SYMBOL(dst_release);
-
- void dst_release_immediate(struct dst_entry *dst)
- {
-- if (dst) {
-- int newrefcnt;
--
-- newrefcnt = atomic_dec_return(&dst->__refcnt);
-- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
-- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-- __func__, dst, newrefcnt);
-- if (!newrefcnt)
-- dst_destroy(dst);
-- }
-+ if (dst && rcuref_put(&dst->__rcuref))
-+ dst_destroy(dst);
- }
- EXPORT_SYMBOL(dst_release_immediate);
-
-diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
-index 854b3fd66b1be..90810408cc5df 100644
---- a/net/core/rtnetlink.c
-+++ b/net/core/rtnetlink.c
-@@ -839,7 +839,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- if (dst) {
- ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
- ci.rta_used = dst->__use;
-- ci.rta_clntref = atomic_read(&dst->__refcnt);
-+ ci.rta_clntref = rcuref_read(&dst->__rcuref);
- }
- if (expires) {
- unsigned long clock;
-diff --git a/net/ipv6/route.c b/net/ipv6/route.c
-index 7205adee46c21..9db0b2318e918 100644
---- a/net/ipv6/route.c
-+++ b/net/ipv6/route.c
-@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = {
-
- static const struct rt6_info ip6_null_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -ENETUNREACH,
-@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = {
-
- static const struct rt6_info ip6_prohibit_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -EACCES,
-@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
-
- static const struct rt6_info ip6_blk_hole_entry_template = {
- .dst = {
-- .__refcnt = ATOMIC_INIT(1),
-+ .__rcuref = RCUREF_INIT(1),
- .__use = 1,
- .obsolete = DST_OBSOLETE_FORCE_CHK,
- .error = -EINVAL,
-diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
-index 7243079ef3546..70ef036909fb0 100644
---- a/net/netfilter/ipvs/ip_vs_xmit.c
-+++ b/net/netfilter/ipvs/ip_vs_xmit.c
-@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
- spin_unlock_bh(&dest->dst_lock);
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
- &dest->addr.ip, &dest_dst->dst_saddr.ip,
-- atomic_read(&rt->dst.__refcnt));
-+ rcuref_read(&rt->dst.__rcuref));
- }
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.ip;
-@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
- spin_unlock_bh(&dest->dst_lock);
- IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest_dst->dst_saddr.in6,
-- atomic_read(&rt->dst.__refcnt));
-+ rcuref_read(&rt->dst.__rcuref));
- }
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.in6;
---
-2.40.1
-
+++ /dev/null
-From 51290b74abe5ae7c0313a41f7e182e0d23a0ad56 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Fri, 18 Nov 2022 19:19:08 +0000
-Subject: net: Use call_rcu_hurry() for dst_release()
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit 483c26ff63f42e8898ed43aca0b9953bc91f0cd4 ]
-
-On ChromeOS, kernels built with the new CONFIG_RCU_LAZY=y Kconfig
-option fail a networking test in the teardown phase.
-
-This failure may be reproduced as follows: ip netns del <name>
-
-The CONFIG_RCU_LAZY=y Kconfig option was introduced by earlier commits
-in this series for the benefit of certain battery-powered systems.
-This Kconfig option causes call_rcu() to delay its callbacks in order
-to batch them. This means that a given RCU grace period covers more
-callbacks, thus reducing the number of grace periods, in turn reducing
-the amount of energy consumed, which increases battery lifetime, which
-can be a very good thing. This is not a subtle effect: In some important
-use cases, the battery lifetime is increased by more than 10%.
-
-This CONFIG_RCU_LAZY=y option is available only for CPUs that offload
-callbacks, for example, CPUs mentioned in the rcu_nocbs kernel boot
-parameter passed to kernels built with CONFIG_RCU_NOCB_CPU=y.
-
-Delaying callbacks is normally not a problem because most callbacks do
-nothing but free memory. If the system is short on memory, a shrinker
-will kick all currently queued lazy callbacks out of their laziness,
-thus freeing their memory in short order. Similarly, the rcu_barrier()
-function, which blocks until all currently queued callbacks are invoked,
-will also kick lazy callbacks, thus enabling rcu_barrier() to complete
-in a timely manner.
-
-However, there are some cases where laziness is not a good option.
-For example, synchronize_rcu() invokes call_rcu(), and blocks until
-the newly queued callback is invoked. It would not be good for
-synchronize_rcu() to block for ten seconds, even on an idle system.
-Therefore, synchronize_rcu() invokes call_rcu_hurry() instead of
-call_rcu(). The arrival of a non-lazy call_rcu_hurry() callback on a
-given CPU kicks any lazy callbacks that might be already queued on that
-CPU. After all, if there is going to be a grace period, all callbacks
-might as well get full benefit from it.
-
-Yes, this could be done the other way around by creating a
-call_rcu_lazy(), but earlier experience with this approach and
-feedback at the 2022 Linux Plumbers Conference shifted the approach
-to call_rcu() being lazy with call_rcu_hurry() for the few places
-where laziness is inappropriate.
-
-Returning to the test failure, use of ftrace showed that this failure
-was caused by the added delays due to this new lazy behavior of
-call_rcu() in kernels built with CONFIG_RCU_LAZY=y.
-
-Therefore, make dst_release() use call_rcu_hurry() in order to revert
-to the old test-failure-free behavior.
-
-[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]
-
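As a rule of thumb for the two APIs, a hedged sketch ("example_obj" and the
helper names are made up; only call_rcu() and call_rcu_hurry() are real):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_obj {
		struct rcu_head rcu;
		int data;
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct example_obj, rcu));
	}

	/* Ordinary deferred free: laziness is fine, keep the default call_rcu(). */
	static void example_put(struct example_obj *obj)
	{
		call_rcu(&obj->rcu, example_free_rcu);
	}

	/* Something will soon wait for this callback to run (much as netns
	 * teardown effectively waits on dst_destroy_rcu() in this patch),
	 * so do not let it linger behind lazy batching. */
	static void example_put_urgent(struct example_obj *obj)
	{
		call_rcu_hurry(&obj->rcu, example_free_rcu);
	}
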
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Cc: David Ahern <dsahern@kernel.org>
-Cc: "David S. Miller" <davem@davemloft.net>
-Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
-Cc: Jakub Kicinski <kuba@kernel.org>
-Cc: Paolo Abeni <pabeni@redhat.com>
-Cc: <netdev@vger.kernel.org>
-Reviewed-by: Eric Dumazet <edumazet@google.com>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- net/core/dst.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/net/core/dst.c b/net/core/dst.c
-index bc9c9be4e0801..a4e738d321ba2 100644
---- a/net/core/dst.c
-+++ b/net/core/dst.c
-@@ -174,7 +174,7 @@ void dst_release(struct dst_entry *dst)
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
-- call_rcu(&dst->rcu_head, dst_destroy_rcu);
-+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
- }
- }
- EXPORT_SYMBOL(dst_release);
---
-2.40.1
-
+++ /dev/null
-From 3f132e8e674299042d9e5313dfbfcb3de55af912 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sat, 17 Sep 2022 16:41:59 +0000
-Subject: rcu: Fix late wakeup when flush of bypass cblist happens
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit b50606f35f4b73c8e4c6b9c64fe7ba72ea919134 ]
-
-When the bypass cblist gets too big or its timeout has occurred, it is
-flushed into the main cblist. However, the bypass timer is still running
-and the behavior is that it would eventually expire and wake the GP
-thread.
-
-Since we are going to use the bypass cblist for lazy CBs, do the wakeup
-as soon as the flush of a "too big or too long" bypass list happens.
-Otherwise, long delays can happen for callbacks which get promoted from
-lazy to non-lazy.
-
-This is a good thing to do anyway (regardless of future lazy patches),
-since it makes the behavior consistent with that of other code paths
-where flushing into the ->cblist quickly brings the GP kthread out of
-its sleeping state.
-
-[ Frederic Weisbecker: Changes to avoid unnecessary GP-thread wakeups plus
- comment changes. ]
-
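The ordering matters: whether the regular list "was empty" has to be sampled
before the flush, and the wakeup issued right after it. A generic, hedged
sketch of that pattern (made-up names, not the RCU code itself):

	#include <linux/list.h>
	#include <linux/types.h>

	static void example_wake_consumer(void)
	{
		/* stand-in for waking the nocb GP kthread */
	}

	static void example_flush(struct list_head *src, struct list_head *dst)
	{
		/* Sample emptiness BEFORE the splice; afterwards dst is never
		 * empty, so a post-splice check could never trigger the wake. */
		bool was_empty = list_empty(dst);

		list_splice_tail_init(src, dst);

		if (was_empty)
			example_wake_consumer();
	}
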
-Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- kernel/rcu/tree_nocb.h | 10 ++++++++--
- 1 file changed, 8 insertions(+), 2 deletions(-)
-
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 0a5f0ef414845..04c87f250e01a 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -433,8 +433,9 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
-+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
-+
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
-- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-@@ -447,7 +448,12 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
-- rcu_nocb_unlock_irqrestore(rdp, flags);
-+
-+ // The flush succeeded and we moved CBs into the regular list.
-+ // Don't wait for the wake up timer as it may be too far ahead.
-+ // Wake up the GP thread now instead, if the cblist was empty.
-+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
-+
- return true; // Callback already enqueued.
- }
-
---
-2.40.1
-
+++ /dev/null
-From 6e201fbbe533ee08318f49c360c83145a1231ac2 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sun, 16 Oct 2022 16:22:53 +0000
-Subject: rcu: Fix missing nocb gp wake on rcu_barrier()
-
-From: Frederic Weisbecker <frederic@kernel.org>
-
-[ Upstream commit b8f7aca3f0e0e6223094ba2662bac90353674b04 ]
-
-In preparation for RCU lazy changes, wake up the RCU nocb gp thread if
-needed after an entrain. This change prevents the RCU barrier callback
-from waiting in the queue for several seconds before the lazy callbacks
-in front of it are serviced.
-
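To see why this latency matters, consider the common teardown pattern below
(a hedged sketch with made-up names); without the wakeup, the rcu_barrier()
call can sit behind lazy callbacks for several seconds:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static struct kmem_cache *example_cache;

	static void example_module_exit(void)
	{
		/* Objects from example_cache were queued for freeing via
		 * call_rcu() earlier; every one of those callbacks must have
		 * run before the cache itself may be destroyed. */
		rcu_barrier();
		kmem_cache_destroy(example_cache);
	}
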
-Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- kernel/rcu/tree.c | 11 +++++++++++
- kernel/rcu/tree.h | 1 +
- kernel/rcu/tree_nocb.h | 5 +++++
- 3 files changed, 17 insertions(+)
-
-diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
-index 917a1e43f7839..6ea59aa53db78 100644
---- a/kernel/rcu/tree.c
-+++ b/kernel/rcu/tree.c
-@@ -3908,6 +3908,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- {
- unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
- unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
-+ bool wake_nocb = false;
-+ bool was_alldone = false;
-
- lockdep_assert_held(&rcu_state.barrier_lock);
- if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
-@@ -3916,7 +3918,14 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- rdp->barrier_head.func = rcu_barrier_callback;
- debug_rcu_head_queue(&rdp->barrier_head);
- rcu_nocb_lock(rdp);
-+ /*
-+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
-+ * queue. This way we don't wait for bypass timer that can reach seconds
-+ * if it's fully lazy.
-+ */
-+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
- atomic_inc(&rcu_state.barrier_cpu_count);
- } else {
-@@ -3924,6 +3933,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
- }
- rcu_nocb_unlock(rdp);
-+ if (wake_nocb)
-+ wake_nocb_gp(rdp, false);
- smp_store_release(&rdp->barrier_seq_snap, gseq);
- }
-
-diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
-index d4a97e40ea9c3..925dd98f8b23b 100644
---- a/kernel/rcu/tree.h
-+++ b/kernel/rcu/tree.h
-@@ -439,6 +439,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
- static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
- static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
- static void rcu_init_one_nocb(struct rcu_node *rnp);
-+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 04c87f250e01a..74d4983d68f82 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -1570,6 +1570,11 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
- {
- }
-
-+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
-+{
-+ return false;
-+}
-+
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
- {
---
-2.40.1
-
+++ /dev/null
-From 7b253194c188b40a04df52ea0aeacae23989ef0d Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Sun, 16 Oct 2022 16:22:54 +0000
-Subject: rcu: Make call_rcu() lazy to save power
-
-From: Joel Fernandes (Google) <joel@joelfernandes.org>
-
-[ Upstream commit 3cb278e73be58bfb780ecd55129296d2f74c1fb7 ]
-
-Implement timer-based RCU callback batching (also known as lazy
-callbacks). With this we save about 5-10% of power consumed due
-to RCU requests that happen when the system is lightly loaded or idle.
-
-By default, all async callbacks (queued via call_rcu) are marked
-lazy. An alternate API call_rcu_hurry() is provided for the few users,
-for example synchronize_rcu(), that need the old behavior.
-
-The batch is flushed whenever a certain amount of time has passed, or
-the batch on a particular CPU grows too big. A future patch will also
-flush it under memory pressure.
-
-To handle several corner cases automagically (such as rcu_barrier() and
-hotplug), we re-use the bypass lists, which were originally introduced to
-address lock contention, to handle lazy CBs as well. The bypass list
-length has the lazy CB length included in it. A separate lazy CB length
-counter is also introduced to keep track of the number of lazy CBs.
-
-[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ]
-[ paulmck: Apply Zqiang feedback. ]
-[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]
-
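A hedged sketch of how test code is expected to use the new flush-timeout
hooks (the two rcu_lazy_*_jiffies_till_flush() helpers are the ones added
below; the wrapper function and its placement in kernel/rcu/ test code are
assumptions):

	#include <linux/jiffies.h>
	#include "rcu.h"	/* rcu_lazy_{get,set}_jiffies_till_flush() */

	static void example_shorten_lazy_flush(void)
	{
		unsigned long orig = rcu_lazy_get_jiffies_till_flush();

		/* Flush lazy callbacks after at most 1 second instead of the
		 * default LAZY_FLUSH_JIFFIES (10 * HZ). */
		rcu_lazy_set_jiffies_till_flush(HZ);

		/* ... exercise call_rcu() latency here ... */

		rcu_lazy_set_jiffies_till_flush(orig);
	}
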
-Suggested-by: Paul McKenney <paulmck@kernel.org>
-Acked-by: Frederic Weisbecker <frederic@kernel.org>
-Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
-Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
-Stable-dep-of: cc9b364bb1d5 ("xfrm6: fix inet6_dev refcount underflow problem")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/linux/rcupdate.h | 9 +++
- kernel/rcu/Kconfig | 8 ++
- kernel/rcu/rcu.h | 8 ++
- kernel/rcu/tiny.c | 2 +-
- kernel/rcu/tree.c | 129 ++++++++++++++++++++-----------
- kernel/rcu/tree.h | 11 ++-
- kernel/rcu/tree_exp.h | 2 +-
- kernel/rcu/tree_nocb.h | 159 +++++++++++++++++++++++++++++++--------
- 8 files changed, 246 insertions(+), 82 deletions(-)
-
-diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
-index e9e61cd27ef63..46f05dc8b31aa 100644
---- a/include/linux/rcupdate.h
-+++ b/include/linux/rcupdate.h
-@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)
-
- #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
-+#ifdef CONFIG_RCU_LAZY
-+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
-+#else
-+static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
-+{
-+ call_rcu(head, func);
-+}
-+#endif
-+
- /* Internal to kernel */
- void rcu_init(void);
- extern int rcu_scheduler_active;
-diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
-index d471d22a5e21b..d78f6181c8aad 100644
---- a/kernel/rcu/Kconfig
-+++ b/kernel/rcu/Kconfig
-@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
- Say N here if you hate read-side memory barriers.
- Take the default if you are unsure.
-
-+config RCU_LAZY
-+ bool "RCU callback lazy invocation functionality"
-+ depends on RCU_NOCB_CPU
-+ default n
-+ help
-+ To save power, batch RCU callbacks and flush after delay, memory
-+ pressure, or callback list growing too big.
-+
- endmenu # "RCU Subsystem"
-diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
-index 48d8f754b730e..6b86c5912beaf 100644
---- a/kernel/rcu/rcu.h
-+++ b/kernel/rcu/rcu.h
-@@ -474,6 +474,14 @@ enum rcutorture_type {
- INVALID_RCU_FLAVOR
- };
-
-+#if defined(CONFIG_RCU_LAZY)
-+unsigned long rcu_lazy_get_jiffies_till_flush(void);
-+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
-+#else
-+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
-+#endif
-+
- #if defined(CONFIG_TREE_RCU)
- void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq);
-diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
-index a33a8d4942c37..72913ce21258b 100644
---- a/kernel/rcu/tiny.c
-+++ b/kernel/rcu/tiny.c
-@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
-
- void rcu_barrier(void)
- {
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- }
- EXPORT_SYMBOL(rcu_barrier);
-
-diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
-index 6ea59aa53db78..855c035ec9630 100644
---- a/kernel/rcu/tree.c
-+++ b/kernel/rcu/tree.c
-@@ -2731,47 +2731,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
- raw_spin_unlock_rcu_node(rnp);
- }
-
--/**
-- * call_rcu() - Queue an RCU callback for invocation after a grace period.
-- * @head: structure to be used for queueing the RCU updates.
-- * @func: actual callback function to be invoked after the grace period
-- *
-- * The callback function will be invoked some time after a full grace
-- * period elapses, in other words after all pre-existing RCU read-side
-- * critical sections have completed. However, the callback function
-- * might well execute concurrently with RCU read-side critical sections
-- * that started after call_rcu() was invoked.
-- *
-- * RCU read-side critical sections are delimited by rcu_read_lock()
-- * and rcu_read_unlock(), and may be nested. In addition, but only in
-- * v5.0 and later, regions of code across which interrupts, preemption,
-- * or softirqs have been disabled also serve as RCU read-side critical
-- * sections. This includes hardware interrupt handlers, softirq handlers,
-- * and NMI handlers.
-- *
-- * Note that all CPUs must agree that the grace period extended beyond
-- * all pre-existing RCU read-side critical section. On systems with more
-- * than one CPU, this means that when "func()" is invoked, each CPU is
-- * guaranteed to have executed a full memory barrier since the end of its
-- * last RCU read-side critical section whose beginning preceded the call
-- * to call_rcu(). It also means that each CPU executing an RCU read-side
-- * critical section that continues beyond the start of "func()" must have
-- * executed a memory barrier after the call_rcu() but before the beginning
-- * of that RCU read-side critical section. Note that these guarantees
-- * include CPUs that are offline, idle, or executing in user mode, as
-- * well as CPUs that are executing in the kernel.
-- *
-- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
-- * resulting RCU callback function "func()", then both CPU A and CPU B are
-- * guaranteed to execute a full memory barrier during the time interval
-- * between the call to call_rcu() and the invocation of "func()" -- even
-- * if CPU A and CPU B are the same CPU (but again only if the system has
-- * more than one CPU).
-- *
-- * Implementation of these memory-ordering guarantees is described here:
-- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
-- */
--void call_rcu(struct rcu_head *head, rcu_callback_t func)
-+static void
-+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
- {
- static atomic_t doublefrees;
- unsigned long flags;
-@@ -2812,7 +2773,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
- }
-
- check_cb_ovld(rdp);
-- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
-+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
- return; // Enqueued onto ->nocb_bypass, so just leave.
- // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
- rcu_segcblist_enqueue(&rdp->cblist, head);
-@@ -2834,8 +2795,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
- local_irq_restore(flags);
- }
- }
--EXPORT_SYMBOL_GPL(call_rcu);
-
-+#ifdef CONFIG_RCU_LAZY
-+/**
-+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
-+ * flush all lazy callbacks (including the new one) to the main ->cblist while
-+ * doing so.
-+ *
-+ * @head: structure to be used for queueing the RCU updates.
-+ * @func: actual callback function to be invoked after the grace period
-+ *
-+ * The callback function will be invoked some time after a full grace
-+ * period elapses, in other words after all pre-existing RCU read-side
-+ * critical sections have completed.
-+ *
-+ * Use this API instead of call_rcu() if you don't want the callback to be
-+ * invoked after very long periods of time, which can happen on systems without
-+ * memory pressure and on systems which are lightly loaded or mostly idle.
-+ * This function will cause callbacks to be invoked sooner than later at the
-+ * expense of extra power. Other than that, this function is identical to, and
-+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
-+ * ordering and other functionality.
-+ */
-+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
-+{
-+ return __call_rcu_common(head, func, false);
-+}
-+EXPORT_SYMBOL_GPL(call_rcu_hurry);
-+#endif
-+
-+/**
-+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
-+ * By default the callbacks are 'lazy' and are kept hidden from the main
-+ * ->cblist to prevent starting of grace periods too soon.
-+ * If you desire grace periods to start very soon, use call_rcu_hurry().
-+ *
-+ * @head: structure to be used for queueing the RCU updates.
-+ * @func: actual callback function to be invoked after the grace period
-+ *
-+ * The callback function will be invoked some time after a full grace
-+ * period elapses, in other words after all pre-existing RCU read-side
-+ * critical sections have completed. However, the callback function
-+ * might well execute concurrently with RCU read-side critical sections
-+ * that started after call_rcu() was invoked.
-+ *
-+ * RCU read-side critical sections are delimited by rcu_read_lock()
-+ * and rcu_read_unlock(), and may be nested. In addition, but only in
-+ * v5.0 and later, regions of code across which interrupts, preemption,
-+ * or softirqs have been disabled also serve as RCU read-side critical
-+ * sections. This includes hardware interrupt handlers, softirq handlers,
-+ * and NMI handlers.
-+ *
-+ * Note that all CPUs must agree that the grace period extended beyond
-+ * all pre-existing RCU read-side critical section. On systems with more
-+ * than one CPU, this means that when "func()" is invoked, each CPU is
-+ * guaranteed to have executed a full memory barrier since the end of its
-+ * last RCU read-side critical section whose beginning preceded the call
-+ * to call_rcu(). It also means that each CPU executing an RCU read-side
-+ * critical section that continues beyond the start of "func()" must have
-+ * executed a memory barrier after the call_rcu() but before the beginning
-+ * of that RCU read-side critical section. Note that these guarantees
-+ * include CPUs that are offline, idle, or executing in user mode, as
-+ * well as CPUs that are executing in the kernel.
-+ *
-+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
-+ * resulting RCU callback function "func()", then both CPU A and CPU B are
-+ * guaranteed to execute a full memory barrier during the time interval
-+ * between the call to call_rcu() and the invocation of "func()" -- even
-+ * if CPU A and CPU B are the same CPU (but again only if the system has
-+ * more than one CPU).
-+ *
-+ * Implementation of these memory-ordering guarantees is described here:
-+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
-+ */
-+void call_rcu(struct rcu_head *head, rcu_callback_t func)
-+{
-+ return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
-+}
-+EXPORT_SYMBOL_GPL(call_rcu);
-
- /* Maximum number of jiffies to wait before draining a batch. */
- #define KFREE_DRAIN_JIFFIES (5 * HZ)
-@@ -3521,7 +3558,7 @@ void synchronize_rcu(void)
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- return;
- }
-
-@@ -3924,7 +3961,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
- * if it's fully lazy.
- */
- was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
- wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
- atomic_inc(&rcu_state.barrier_cpu_count);
-@@ -4359,7 +4396,7 @@ void rcutree_migrate_callbacks(int cpu)
- my_rdp = this_cpu_ptr(&rcu_data);
- my_rnp = my_rdp->mynode;
- rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
- raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
- /* Leverage recent GPs and set GP for new callbacks. */
- needwake = rcu_advance_cbs(my_rnp, rdp) ||
-diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
-index 925dd98f8b23b..fcb5d696eb170 100644
---- a/kernel/rcu/tree.h
-+++ b/kernel/rcu/tree.h
-@@ -263,14 +263,16 @@ struct rcu_data {
- unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
- unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
-
-+ long lazy_len; /* Length of buffered lazy callbacks. */
- int cpu;
- };
-
- /* Values for nocb_defer_wakeup field in struct rcu_data. */
- #define RCU_NOCB_WAKE_NOT 0
- #define RCU_NOCB_WAKE_BYPASS 1
--#define RCU_NOCB_WAKE 2
--#define RCU_NOCB_WAKE_FORCE 3
-+#define RCU_NOCB_WAKE_LAZY 2
-+#define RCU_NOCB_WAKE 3
-+#define RCU_NOCB_WAKE_FORCE 4
-
- #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
- /* For jiffies_till_first_fqs and */
-@@ -441,9 +443,10 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
- static void rcu_init_one_nocb(struct rcu_node *rnp);
- static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j);
-+ unsigned long j, bool lazy);
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags);
-+ bool *was_alldone, unsigned long flags,
-+ bool lazy);
- static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags);
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
-diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
-index aa3ec3c3b9f75..b9637df7cda70 100644
---- a/kernel/rcu/tree_exp.h
-+++ b/kernel/rcu/tree_exp.h
-@@ -941,7 +941,7 @@ void synchronize_rcu_expedited(void)
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
-- wait_rcu_gp(call_rcu);
-+ wait_rcu_gp(call_rcu_hurry);
- return;
- }
-
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
-index 74d4983d68f82..c3ec5f389d27f 100644
---- a/kernel/rcu/tree_nocb.h
-+++ b/kernel/rcu/tree_nocb.h
-@@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
- }
-
-+/*
-+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
-+ * can elapse before lazy callbacks are flushed. Lazy callbacks
-+ * could be flushed much earlier for a number of other reasons
-+ * could be flushed much earlier for a number of other reasons;
-+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
-+ * left unsubmitted to RCU after that many jiffies.
-+#define LAZY_FLUSH_JIFFIES (10 * HZ)
-+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
-+
-+#ifdef CONFIG_RCU_LAZY
-+// To be called only from test code.
-+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
-+{
-+ jiffies_till_flush = jif;
-+}
-+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
-+
-+unsigned long rcu_lazy_get_jiffies_till_flush(void)
-+{
-+ return jiffies_till_flush;
-+}
-+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
-+#endif
-+
- /*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
-@@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
-
- /*
-- * Bypass wakeup overrides previous deferments. In case
-- * of callback storm, no need to wake up too early.
-+ * Bypass wakeup overrides previous deferments. In case of
-+ * callback storms, no need to wake up too early.
- */
-- if (waketype == RCU_NOCB_WAKE_BYPASS) {
-+ if (waketype == RCU_NOCB_WAKE_LAZY &&
-+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
-+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
-+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
-+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- } else {
-@@ -293,10 +322,13 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
-+ * Return true if there was something to be flushed and it succeeded, otherwise
-+ * false.
-+ *
- * Note that this function always returns true if rhp is NULL.
- */
- static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- struct rcu_cblist rcl;
-
-@@ -310,7 +342,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
-- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
-+
-+ /*
-+ * If the new CB requested was a lazy one, queue it onto the main
-+ * ->cblist so we can take advantage of a sooner grace period.
-+ */
-+ if (lazy && rhp) {
-+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
-+ rcu_cblist_enqueue(&rcl, rhp);
-+ WRITE_ONCE(rdp->lazy_len, 0);
-+ } else {
-+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
-+ WRITE_ONCE(rdp->lazy_len, 0);
-+ }
-+
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
-@@ -326,13 +371,13 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- * Note that this function always returns true if rhp is NULL.
- */
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
-- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
- }
-
- /*
-@@ -345,7 +390,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
-- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
- }
-
- /*
-@@ -367,12 +412,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
- * there is only one CPU in operation.
- */
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags)
-+ bool *was_alldone, unsigned long flags,
-+ bool lazy)
- {
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
-
- lockdep_assert_irqs_disabled();
-
-@@ -417,25 +464,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
-- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
-+ // Lazy CBs throttle this back and do immediate bypass queuing.
-+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
-+
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
-- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
-+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
-+ (ncbs && bypass_is_lazy &&
-+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
-
-- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
-+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
-@@ -463,13 +514,24 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
-+
-+ if (lazy)
-+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
-+
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
-- if (ncbs) {
-+ // A wake up of the grace period kthread or timer adjustment
-+ // needs to be done only if:
-+ // 1. Bypass list was fully empty before (this is the first
-+ // bypass list entry), or:
-+ // 2. Both of these conditions are met:
-+ // a. The bypass list previously had only lazy CBs, and:
-+ // b. The new CB is non-lazy.
-+ if (ncbs && (!bypass_is_lazy || lazy)) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
-@@ -497,8 +559,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
- {
-+ long bypass_len;
- unsigned long cur_gp_seq;
- unsigned long j;
-+ long lazy_len;
- long len;
- struct task_struct *t;
-
-@@ -512,9 +576,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
-+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ lazy_len = READ_ONCE(rdp->lazy_len);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
-- if (!irqs_disabled_flags(flags)) {
-+ // Only lazy CBs in bypass list
-+ if (lazy_len && bypass_len == lazy_len) {
-+ rcu_nocb_unlock_irqrestore(rdp, flags);
-+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
-+ TPS("WakeLazy"));
-+ } else if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
-@@ -605,12 +676,12 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
- static void nocb_gp_wait(struct rcu_data *my_rdp)
- {
- bool bypass = false;
-- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
-+ bool lazy = false;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
-@@ -640,24 +711,43 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
- * won't be ignored for long.
- */
- list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
-+ long bypass_ncbs;
-+ bool flush_bypass = false;
-+ long lazy_ncbs;
-+
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- lockdep_assert_held(&rdp->nocb_lock);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-- if (bypass_ncbs &&
-+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
-+
-+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
-+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
-+ bypass_ncbs > 2 * qhimark)) {
-+ flush_bypass = true;
-+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
-- // Bypass full or old, so flush it.
-- (void)rcu_nocb_try_flush_bypass(rdp, j);
-- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ flush_bypass = true;
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- continue; /* No callbacks here, try next. */
- }
-+
-+ if (flush_bypass) {
-+ // Bypass full or old, so flush it.
-+ (void)rcu_nocb_try_flush_bypass(rdp, j);
-+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
-+ }
-+
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
-- TPS("Bypass"));
-- bypass = true;
-+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
-+ if (bypass_ncbs == lazy_ncbs)
-+ lazy = true;
-+ else
-+ bypass = true;
- }
- rnp = rdp->mynode;
-
-@@ -705,12 +795,20 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
-
-- if (bypass && !rcu_nocb_poll) {
-- // At least one child with non-empty ->nocb_bypass, so set
-- // timer in order to avoid stranding its callbacks.
-- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-- TPS("WakeBypassIsDeferred"));
-+ // At least one child with non-empty ->nocb_bypass, so set
-+ // timer in order to avoid stranding its callbacks.
-+ if (!rcu_nocb_poll) {
-+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
-+ if (lazy && !bypass) {
-+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
-+ TPS("WakeLazyIsDeferred"));
-+ // Otherwise add a deferred bypass wake up.
-+ } else if (bypass) {
-+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-+ TPS("WakeBypassIsDeferred"));
-+ }
- }
-+
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
-@@ -1036,7 +1134,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
-- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
- /*
- * Start with invoking rcu_core() early. This way if the current thread
- * happens to preempt an ongoing call to rcu_core() in the middle,
-@@ -1290,6 +1388,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-+ WRITE_ONCE(rdp->lazy_len, 0);
- mutex_init(&rdp->nocb_gp_kthread_mutex);
- }
-
-@@ -1576,13 +1675,13 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
- }
-
- static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- unsigned long j)
-+ unsigned long j, bool lazy)
- {
- return true;
- }
-
- static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-- bool *was_alldone, unsigned long flags)
-+ bool *was_alldone, unsigned long flags, bool lazy)
- {
- return false;
- }
---
-2.40.1
-
fprobe-add-nr_maxactive-to-specify-rethook_node-pool.patch
fprobe-fix-to-ensure-the-number-of-active-retprobes-.patch
net-xfrm-skip-policies-marked-as-dead-while-reinsert.patch
-rcu-fix-late-wakeup-when-flush-of-bypass-cblist-happ.patch
-rcu-fix-missing-nocb-gp-wake-on-rcu_barrier.patch
-rcu-make-call_rcu-lazy-to-save-power.patch
-net-use-call_rcu_hurry-for-dst_release.patch
-atomics-provide-atomic_add_negative-variants.patch
-atomics-provide-rcuref-scalable-reference-counting.patch
-net-dst-prevent-false-sharing-vs.-dst_entry-__refcnt.patch
-net-dst-switch-to-rcuref_t-reference-counting.patch
-net-dst-fix-missing-initialization-of-rt_uncached.patch
xfrm6-fix-inet6_dev-refcount-underflow-problem.patch
net-mlx5-e-switch-register-event-handler-before-armi.patch
net-mlx5-handle-fw-tracer-change-ownership-event-bas.patch
phy-mapphone-mdm6600-fix-runtime-pm-for-remove.patch
phy-mapphone-mdm6600-fix-pinctrl_pm-handling-for-sle.patch
net-move-altnames-together-with-the-netdevice.patch
+bluetooth-hci_sock-fix-slab-oob-read-in-create_monitor_event.patch
+bluetooth-hci_sock-correctly-bounds-check-and-pad-hci_mon_new_index-name.patch
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
- net/ipv6/xfrm6_policy.c | 4 ++--
+ net/ipv6/xfrm6_policy.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
-index eecc5e59da17c..50c278f1c1063 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
-@@ -117,10 +117,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
+@@ -118,11 +118,11 @@ static void xfrm6_dst_destroy(struct dst
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (likely(xdst->u.rt6.rt6i_idev))
- in6_dev_put(xdst->u.rt6.rt6i_idev);
dst_destroy_metrics_generic(dst);
- rt6_uncached_list_del(&xdst->u.rt6);
+ if (xdst->u.rt6.rt6i_uncached_list)
+ rt6_uncached_list_del(&xdst->u.rt6);
+ if (likely(xdst->u.rt6.rt6i_idev))
+ in6_dev_put(xdst->u.rt6.rt6i_idev);
xfrm_dst_destroy(xdst);
}
---
-2.40.1
-