From: Sasha Levin
Date: Mon, 15 Nov 2021 13:42:05 +0000 (-0500)
Subject: Fixes for 5.14
X-Git-Tag: v5.4.160~52^2~2
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8365319e1c4b7f48973d4fda406e47d2b3333e94;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.14

Signed-off-by: Sasha Levin
---

diff --git a/queue-5.14/bpf-cgroup-assign-cgroup-in-cgroup_sk_alloc-when-cal.patch b/queue-5.14/bpf-cgroup-assign-cgroup-in-cgroup_sk_alloc-when-cal.patch
new file mode 100644
index 00000000000..c6540151caf
--- /dev/null
+++ b/queue-5.14/bpf-cgroup-assign-cgroup-in-cgroup_sk_alloc-when-cal.patch
@@ -0,0 +1,94 @@
+From f0e12bf7f8d17ee03634576fc011f4fea557b7c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 27 Sep 2021 14:39:20 +0200
+Subject: bpf, cgroup: Assign cgroup in cgroup_sk_alloc when called from
+ interrupt
+
+From: Daniel Borkmann
+
+[ Upstream commit 78cc316e9583067884eb8bd154301dc1e9ee945c ]
+
+If cgroup_sk_alloc() is called from interrupt context, then just assign the
+root cgroup to skcd->cgroup. Prior to commit 8520e224f547 ("bpf, cgroups:
+Fix cgroup v2 fallback on v1/v2 mixed mode") we would just return, and later
+on in sock_cgroup_ptr() we were NULL-testing the cgroup in the fast path,
+and iff indeed NULL, returning the root cgroup (v ?: &cgrp_dfl_root.cgrp).
+Rather than re-adding the NULL test to the fast path, we can just assign it
+once from cgroup_sk_alloc() given v1/v2 handling has been simplified. The
+migration from the NULL test returning &cgrp_dfl_root.cgrp to assigning
+&cgrp_dfl_root.cgrp directly does /not/ change behavior for callers of
+sock_cgroup_ptr().
+
+syzkaller was able to trigger a splat in the legacy netrom code base, where
+the RX handler in nr_rx_frame() calls nr_make_new(), which calls sk_alloc()
+and therefore cgroup_sk_alloc() under the in_interrupt() condition. This
+leaves skcd->cgroup NULL, which cgroup_sk_free() then trips over, given it
+expects a non-NULL object. There are a few other candidates aside from
+netrom with a similar pattern, where the accept-like implementation just
+calls sk_alloc(), and thus cgroup_sk_alloc(), instead of sk_clone_lock()
+with the corresponding cgroup_sk_clone(), which inherits the cgroup from
+the parent socket. None of them are related to core protocols that BPF
+cgroup programs run on. However, in the future, they should follow suit
+and implement a similar inheritance mechanism.
+
+Additionally, with a !CONFIG_CGROUP_NET_PRIO and !CONFIG_CGROUP_NET_CLASSID
+configuration, the same issue was also exposed prior to 8520e224f547, due to
+commit e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated
+cgroup") which added the early in_interrupt() return back then.
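+
+As an editorial aside (not part of the upstream patch), the difference
+between the two accept paths described above can be sketched as follows;
+the variable names are placeholders:
+
+	/* netrom-style accept path: nr_make_new() allocates a fresh sock
+	 * via sk_alloc(), so cgroup_sk_alloc() runs here in softirq
+	 * context and, with this fix, associates the sock with the v2
+	 * root cgroup instead of leaving skcd->cgroup NULL.
+	 */
+	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0);
+
+	/* core-protocol accept path: sk_clone_lock() internally calls
+	 * cgroup_sk_clone(), so the child sock inherits the parent's
+	 * cgroup rather than falling back to the root.
+	 */
+	nsk = sk_clone_lock(sk, GFP_ATOMIC);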
+
+Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode")
+Fixes: e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup")
+Reported-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
+Reported-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
+Signed-off-by: Daniel Borkmann
+Signed-off-by: Alexei Starovoitov
+Tested-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
+Tested-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
+Acked-by: Tejun Heo
+Link: https://lore.kernel.org/bpf/20210927123921.21535-1-daniel@iogearbox.net
+Signed-off-by: Sasha Levin
+---
+ kernel/cgroup/cgroup.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index 869a16f7684f1..bfbed4c99f166 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -6586,22 +6586,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+ 
+ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+ {
+-	/* Don't associate the sock with unrelated interrupted task's cgroup. */
+-	if (in_interrupt())
+-		return;
++	struct cgroup *cgroup;
+ 
+ 	rcu_read_lock();
++	/* Don't associate the sock with unrelated interrupted task's cgroup. */
++	if (in_interrupt()) {
++		cgroup = &cgrp_dfl_root.cgrp;
++		cgroup_get(cgroup);
++		goto out;
++	}
++
+ 	while (true) {
+ 		struct css_set *cset;
+ 
+ 		cset = task_css_set(current);
+ 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+-			skcd->cgroup = cset->dfl_cgrp;
+-			cgroup_bpf_get(cset->dfl_cgrp);
++			cgroup = cset->dfl_cgrp;
+ 			break;
+ 		}
+ 		cpu_relax();
+ 	}
++out:
++	skcd->cgroup = cgroup;
++	cgroup_bpf_get(cgroup);
+ 	rcu_read_unlock();
+ }
+ 
+-- 
+2.33.0
+
diff --git a/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mo.patch b/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mo.patch
new file mode 100644
index 00000000000..68c98feab1b
--- /dev/null
+++ b/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mo.patch
@@ -0,0 +1,398 @@
+From 9aec4cd9ad553abd4e083435a9dfe1a556747ca0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Tue, 14 Sep 2021 01:07:57 +0200
+Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
+
+From: Daniel Borkmann
+
+[ Upstream commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa ]
+
+Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
+Back in the day, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
+embedded per-socket cgroup information into sock->sk_cgrp_data and, in
+order to save 8 bytes in struct sock, made the two mutually exclusive:
+when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, cgroup v2
+falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
+
+The assumption made was "there is no reason to mix the two and this is in line
+with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
+However, with Kubernetes now more widely supporting cgroup v2 as well,
+this assumption no longer holds, and the possibility of the v1/v2 mixed mode
+with the v2 root fallback being hit becomes a real security issue.
+
+Many cgroup v2 BPF programs are also used for policy enforcement; to pick
+just _one_ example, they can programmatically deny socket-related system
+calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
+a policy bypass for the affected Pods.
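+
+As an editorial illustration only (not from the upstream patch), a minimal
+cgroup v2 program of the kind that the root fallback would silently bypass
+could look as follows; the port number is an arbitrary example, and the
+snippet assumes libbpf's SEC() macro and bpf_htons() from bpf_endian.h:
+
+	SEC("cgroup/connect4")
+	int deny_connect4(struct bpf_sock_addr *ctx)
+	{
+		/* ctx->user_port is in network byte order; returning 0
+		 * rejects the connect(2) attempt with EPERM, returning 1
+		 * allows it.
+		 */
+		if (ctx->user_port == bpf_htons(6379))
+			return 0;
+		return 1;
+	}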
+
+In production environments, we have recently seen this case due to various
+circumstances: i) a different 3rd party agent and/or ii) a container runtime
+such as [0] in the user's environment configuring legacy cgroup v1 net_cls
+tags, which triggered the implicitly mentioned root fallback. Another case is
+Kubernetes projects like kind [1], which create Kubernetes nodes in a container
+and also add cgroup namespaces to the mix, meaning programs which are attached
+to the cgroup v2 root of the cgroup namespace get attached to a non-root
+cgroup v2 path from the init namespace's point of view. And the latter's root
+is out of reach for agents on a kind Kubernetes node to configure. Meaning,
+any entity on the node setting a cgroup v1 net_cls tag will trigger the bypass
+despite cgroup v2 BPF programs attached to the namespace root.
+
+Generally, this mutual exclusiveness no longer holds in today's user
+environments and makes cgroup v2 usage from the BPF side fragile and
+unreliable. This fix adds a proper struct cgroup pointer for the cgroup v2
+case to struct sock_cgroup_data in order to address these issues; this
+implicitly also fixes the tradeoffs made back then with regard to races and
+refcount leaks as stated in bd1060a1d671, and removes the fallback, so that
+cgroup v2 BPF programs always operate as expected.
+
+  [0] https://github.com/nestybox/sysbox/
+  [1] https://kind.sigs.k8s.io/
+
+Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
+Signed-off-by: Daniel Borkmann
+Signed-off-by: Alexei Starovoitov
+Acked-by: Stanislav Fomichev
+Acked-by: Tejun Heo
+Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
+Signed-off-by: Sasha Levin
+---
+ include/linux/cgroup-defs.h  | 107 +++++++++--------------------------
+ include/linux/cgroup.h       |  22 +------
+ kernel/cgroup/cgroup.c       |  50 ++++------------
+ net/core/netclassid_cgroup.c |   7 +--
+ net/core/netprio_cgroup.c    |  10 +---
+ 5 files changed, 41 insertions(+), 155 deletions(-)
+
+diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
+index e1c705fdfa7c5..db2e147e069fe 100644
+--- a/include/linux/cgroup-defs.h
++++ b/include/linux/cgroup-defs.h
+@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
+  * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
+  * per-socket cgroup information except for memcg association.
+  *
+- * On legacy hierarchies, net_prio and net_cls controllers directly set
+- * attributes on each sock which can then be tested by the network layer.
+- * On the default hierarchy, each sock is associated with the cgroup it was
+- * created in and the networking layer can match the cgroup directly.
+- *
+- * To avoid carrying all three cgroup related fields separately in sock,
+- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
+- * On boot, sock_cgroup_data records the cgroup that the sock was created
+- * in so that cgroup2 matches can be made; however, once either net_prio or
+- * net_cls starts being used, the area is overridden to carry prioidx and/or
+- * classid. The two modes are distinguished by whether the lowest bit is
+- * set. Clear bit indicates cgroup pointer while set bit prioidx and
+- * classid.
+- *
+- * While userland may start using net_prio or net_cls at any time, once
+- * either is used, cgroup2 matching no longer works. There is no reason to
+- * mix the two and this is in line with how legacy and v2 compatibility is
+- * handled. On mode switch, cgroup references which are already being
+- * pointed to by socks may be leaked. While this can be remedied by adding
+- * synchronization around sock_cgroup_data, given that the number of leaked
+- * cgroups is bound and highly unlikely to be high, this seems to be the
+- * better trade-off.
++ * On legacy hierarchies, net_prio and net_cls controllers directly
++ * set attributes on each sock which can then be tested by the network
++ * layer. On the default hierarchy, each sock is associated with the
++ * cgroup it was created in and the networking layer can match the
++ * cgroup directly.
+  */
+ struct sock_cgroup_data {
+-	union {
+-#ifdef __LITTLE_ENDIAN
+-		struct {
+-			u8	is_data : 1;
+-			u8	no_refcnt : 1;
+-			u8	unused : 6;
+-			u8	padding;
+-			u16	prioidx;
+-			u32	classid;
+-		} __packed;
+-#else
+-		struct {
+-			u32	classid;
+-			u16	prioidx;
+-			u8	padding;
+-			u8	unused : 6;
+-			u8	no_refcnt : 1;
+-			u8	is_data : 1;
+-		} __packed;
++	struct cgroup	*cgroup; /* v2 */
++#ifdef CONFIG_CGROUP_NET_CLASSID
++	u32		classid; /* v1 */
++#endif
++#ifdef CONFIG_CGROUP_NET_PRIO
++	u16		prioidx; /* v1 */
+ #endif
+-		u64	val;
+-	};
+ };
+ 
+-/*
+- * There's a theoretical window where the following accessors race with
+- * updaters and return part of the previous pointer as the prioidx or
+- * classid. Such races are short-lived and the result isn't critical.
+- */
+ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
+ {
+-	/* fallback to 1 which is always the ID of the root cgroup */
+-	return (skcd->is_data & 1) ? skcd->prioidx : 1;
++#ifdef CONFIG_CGROUP_NET_PRIO
++	return READ_ONCE(skcd->prioidx);
++#else
++	return 1;
++#endif
+ }
+ 
+ static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
+ {
+-	/* fallback to 0 which is the unconfigured default classid */
+-	return (skcd->is_data & 1) ? skcd->classid : 0;
++#ifdef CONFIG_CGROUP_NET_CLASSID
++	return READ_ONCE(skcd->classid);
++#else
++	return 0;
++#endif
+ }
+ 
+-/*
+- * If invoked concurrently, the updaters may clobber each other. The
+- * caller is responsible for synchronization.
+- */
+ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
+ 					   u16 prioidx)
+ {
+-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+-
+-	if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
+-		return;
+-
+-	if (!(skcd_buf.is_data & 1)) {
+-		skcd_buf.val = 0;
+-		skcd_buf.is_data = 1;
+-	}
+-
+-	skcd_buf.prioidx = prioidx;
+-	WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
++#ifdef CONFIG_CGROUP_NET_PRIO
++	WRITE_ONCE(skcd->prioidx, prioidx);
++#endif
+ }
+ 
+ static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
+ 					   u32 classid)
+ {
+-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+-
+-	if (sock_cgroup_classid(&skcd_buf) == classid)
+-		return;
+-
+-	if (!(skcd_buf.is_data & 1)) {
+-		skcd_buf.val = 0;
+-		skcd_buf.is_data = 1;
+-	}
+-
+-	skcd_buf.classid = classid;
+-	WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
++#ifdef CONFIG_CGROUP_NET_CLASSID
++	WRITE_ONCE(skcd->classid, classid);
++#endif
+ }
+ 
+ #else	/* CONFIG_SOCK_CGROUP_DATA */
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index 7bf60454a3136..75c151413fda8 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
+  */
+ #ifdef CONFIG_SOCK_CGROUP_DATA
+ 
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-extern spinlock_t cgroup_sk_update_lock;
+-#endif
+-
+-void cgroup_sk_alloc_disable(void);
+ void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
+ void cgroup_sk_clone(struct sock_cgroup_data *skcd);
+ void cgroup_sk_free(struct sock_cgroup_data *skcd);
+ 
+ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
+ {
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-	unsigned long v;
+-
+-	/*
+-	 * @skcd->val is 64bit but the following is safe on 32bit too as we
+-	 * just need the lower ulong to be written and read atomically.
+-	 */
+-	v = READ_ONCE(skcd->val);
+-
+-	if (v & 3)
+-		return &cgrp_dfl_root.cgrp;
+-
+-	return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
+-#else
+-	return (struct cgroup *)(unsigned long)skcd->val;
+-#endif
++	return skcd->cgroup;
+ }
+ 
+ #else	/* CONFIG_CGROUP_DATA */
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index 552d86b6d6c90..869a16f7684f1 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -6584,74 +6584,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+  */
+ #ifdef CONFIG_SOCK_CGROUP_DATA
+ 
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-
+-DEFINE_SPINLOCK(cgroup_sk_update_lock);
+-static bool cgroup_sk_alloc_disabled __read_mostly;
+-
+-void cgroup_sk_alloc_disable(void)
+-{
+-	if (cgroup_sk_alloc_disabled)
+-		return;
+-	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+-	cgroup_sk_alloc_disabled = true;
+-}
+-
+-#else
+-
+-#define cgroup_sk_alloc_disabled	false
+-
+-#endif
+-
+ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+ {
+-	if (cgroup_sk_alloc_disabled) {
+-		skcd->no_refcnt = 1;
+-		return;
+-	}
+-
+ 	/* Don't associate the sock with unrelated interrupted task's cgroup. */
+ 	if (in_interrupt())
+ 		return;
+ 
+ 	rcu_read_lock();
+-
+ 	while (true) {
+ 		struct css_set *cset;
+ 
+ 		cset = task_css_set(current);
+ 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+-			skcd->val = (unsigned long)cset->dfl_cgrp;
++			skcd->cgroup = cset->dfl_cgrp;
+ 			cgroup_bpf_get(cset->dfl_cgrp);
+ 			break;
+ 		}
+ 		cpu_relax();
+ 	}
+-
+ 	rcu_read_unlock();
+ }
+ 
+ void cgroup_sk_clone(struct sock_cgroup_data *skcd)
+ {
+-	if (skcd->val) {
+-		if (skcd->no_refcnt)
+-			return;
+-		/*
+-		 * We might be cloning a socket which is left in an empty
+-		 * cgroup and the cgroup might have already been rmdir'd.
+-		 * Don't use cgroup_get_live().
+-		 */
+-		cgroup_get(sock_cgroup_ptr(skcd));
+-		cgroup_bpf_get(sock_cgroup_ptr(skcd));
+-	}
++	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
++
++	/*
++	 * We might be cloning a socket which is left in an empty
++	 * cgroup and the cgroup might have already been rmdir'd.
++	 * Don't use cgroup_get_live().
++	 */
++	cgroup_get(cgrp);
++	cgroup_bpf_get(cgrp);
+ }
+ 
+ void cgroup_sk_free(struct sock_cgroup_data *skcd)
+ {
+ 	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+ 
+-	if (skcd->no_refcnt)
+-		return;
+ 	cgroup_bpf_put(cgrp);
+ 	cgroup_put(cgrp);
+ }
+diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
+index b49c57d35a88e..1a6a86693b745 100644
+--- a/net/core/netclassid_cgroup.c
++++ b/net/core/netclassid_cgroup.c
+@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
+ 	struct update_classid_context *ctx = (void *)v;
+ 	struct socket *sock = sock_from_file(file);
+ 
+-	if (sock) {
+-		spin_lock(&cgroup_sk_update_lock);
++	if (sock)
+ 		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
+-		spin_unlock(&cgroup_sk_update_lock);
+-	}
+ 	if (--ctx->batch == 0) {
+ 		ctx->batch = UPDATE_CLASSID_BATCH;
+ 		return n + 1;
+@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ 	struct css_task_iter it;
+ 	struct task_struct *p;
+ 
+-	cgroup_sk_alloc_disable();
+-
+ 	cs->classid = (u32)value;
+ 
+ 	css_task_iter_start(css, 0, &it);
+diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
+index 99a431c56f230..8456dfbe2eb40 100644
+--- a/net/core/netprio_cgroup.c
++++ b/net/core/netprio_cgroup.c
+@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
+ 	if (!dev)
+ 		return -ENODEV;
+ 
+-	cgroup_sk_alloc_disable();
+-
+ 	rtnl_lock();
+ 
+ 	ret = netprio_set_prio(of_css(of), dev, prio);
+@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
+ static int update_netprio(const void *v, struct file *file, unsigned n)
+ {
+ 	struct socket *sock = sock_from_file(file);
+-	if (sock) {
+-		spin_lock(&cgroup_sk_update_lock);
++
++	if (sock)
+ 		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+ 					(unsigned long)v);
+-		spin_unlock(&cgroup_sk_update_lock);
+-	}
+ 	return 0;
+ }
+ 
+@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
+ 	struct task_struct *p;
+ 	struct cgroup_subsys_state *css;
+ 
+-	cgroup_sk_alloc_disable();
+-
+ 	cgroup_taskset_for_each(p, css, tset) {
+ 		void *v = (void *)(unsigned long)css->id;
+ 
+-- 
+2.33.0
+
diff --git a/queue-5.14/net-neigh-enable-state-migration-between-nud_permane.patch b/queue-5.14/net-neigh-enable-state-migration-between-nud_permane.patch
new file mode 100644
index 00000000000..17127d6aa06
--- /dev/null
+++ b/queue-5.14/net-neigh-enable-state-migration-between-nud_permane.patch
@@ -0,0 +1,163 @@
+From 7d1d747d5feaa153c65752b3a49c0b672342936b Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 11 Oct 2021 14:12:36 +0200
+Subject: net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
+
+From: Daniel Borkmann
+
+[ Upstream commit 3dc20f4762c62d3b3f0940644881ed818aa7b2f5 ]
+
+Currently, it is not possible to migrate a neighbor entry between the
+NUD_PERMANENT state and the NTF_USE flag with a dynamic NUD state from a
+user space control plane. Similarly, it is not possible to add/remove the
+NTF_EXT_LEARNED flag on an existing neighbor entry in combination with the
+NTF_USE flag.
+
+This is due to the latter directly calling into neigh_event_send() without
+any of the metadata updates that happen in __neigh_update(). Thus, to enable
+this use case, extend the latter with a NEIGH_UPDATE_F_USE flag where we
+break the NUD_PERMANENT state in particular so that a later
+neigh_event_send() is able to re-resolve the neighbor entry.
+
+Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
+
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
+  [...]
+
+As can be seen, despite the admin-triggered replace, the entry remains in the
+NUD_PERMANENT state.
+
+After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
+
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
+  [...]
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
+  [...]
+
+After the fix, the admin-triggered replace switches to a dynamic state via
+the NTF_USE flag, which triggers a new neighbor resolution. Likewise, we can
+transition back from there, if needed, into NUD_PERMANENT.
+
+Similar before/after behavior can be observed for the transitions below:
+
+Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
+
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
+  [...]
+
+After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
+
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
+  [...]
+  # ./ip/ip n replace 192.168.178.30 dev enp5s0 use
+  # ./ip/ip n
+  192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
+  [...]
+
+Signed-off-by: Daniel Borkmann
+Acked-by: Roopa Prabhu
+Signed-off-by: David S. Miller
+Signed-off-by: Sasha Levin
+---
+ include/net/neighbour.h |  1 +
+ net/core/neighbour.c    | 22 +++++++++++++---------
+ 2 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/include/net/neighbour.h b/include/net/neighbour.h
+index 990f9b1d17092..d5767e25509cc 100644
+--- a/include/net/neighbour.h
++++ b/include/net/neighbour.h
+@@ -253,6 +253,7 @@ static inline void *neighbour_priv(const struct neighbour *n)
+ #define NEIGH_UPDATE_F_OVERRIDE			0x00000001
+ #define NEIGH_UPDATE_F_WEAK_OVERRIDE		0x00000002
+ #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER	0x00000004
++#define NEIGH_UPDATE_F_USE			0x10000000
+ #define NEIGH_UPDATE_F_EXT_LEARNED		0x20000000
+ #define NEIGH_UPDATE_F_ISROUTER			0x40000000
+ #define NEIGH_UPDATE_F_ADMIN			0x80000000
+diff --git a/net/core/neighbour.c b/net/core/neighbour.c
+index 077883f9f570b..704832723ab87 100644
+--- a/net/core/neighbour.c
++++ b/net/core/neighbour.c
+@@ -1221,7 +1221,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
+ 				lladdr instead of overriding it
+ 				if it is different.
+ 	NEIGH_UPDATE_F_ADMIN	means that the change is administrative.
+-
++	NEIGH_UPDATE_F_USE	means that the entry is user triggered.
+ 	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
+ 				NTF_ROUTER flag.
+ 	NEIGH_UPDATE_F_ISROUTER	indicates if the neighbour is known as
+@@ -1259,6 +1259,12 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
+ 		goto out;
+ 
+ 	ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify);
++	if (flags & NEIGH_UPDATE_F_USE) {
++		new = old & ~NUD_PERMANENT;
++		neigh->nud_state = new;
++		err = 0;
++		goto out;
++	}
+ 
+ 	if (!(new & NUD_VALID)) {
+ 		neigh_del_timer(neigh);
+@@ -1968,22 +1974,20 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ 
+ 	if (protocol)
+ 		neigh->protocol = protocol;
+-
+ 	if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ 		flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+-
+ 	if (ndm->ndm_flags & NTF_ROUTER)
+ 		flags |= NEIGH_UPDATE_F_ISROUTER;
++	if (ndm->ndm_flags & NTF_USE)
++		flags |= NEIGH_UPDATE_F_USE;
+ 
+-	if (ndm->ndm_flags & NTF_USE) {
++	err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
++			     NETLINK_CB(skb).portid, extack);
++	if (!err && ndm->ndm_flags & NTF_USE) {
+ 		neigh_event_send(neigh, NULL);
+ 		err = 0;
+-	} else
+-		err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+-				     NETLINK_CB(skb).portid, extack);
+-
+ 	}
+ 	neigh_release(neigh);
+-
+ out:
+ 	return err;
+ }
+-- 
+2.33.0
+
diff --git a/queue-5.14/series b/queue-5.14/series
index e2458f7888d..e02d7a110c4 100644
--- a/queue-5.14/series
+++ b/queue-5.14/series
@@ -793,3 +793,6 @@ mips-fix-duplicated-slashes-for-platform-file-path.patch
 mips-fix-assembly-error-from-mipsr2-code-used-within-mips_isa_arch_level.patch
 x86-mce-add-errata-workaround-for-skylake-skx37.patch
 kvm-x86-move-guest_pv_has-out-of-user_access-section.patch
+net-neigh-enable-state-migration-between-nud_permane.patch
+bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mo.patch
+bpf-cgroup-assign-cgroup-in-cgroup_sk_alloc-when-cal.patch