5.14-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)
diff --git a/queue-5.14/bnxt_en-make-bnxt_free_skbs-safe-to-call-after-bnxt_free_mem.patch b/queue-5.14/bnxt_en-make-bnxt_free_skbs-safe-to-call-after-bnxt_free_mem.patch

new file mode 100644 (file)

index 0000000..8b6eb9e
--- /dev/null
+++ b/queue-5.14/bnxt_en-make-bnxt_free_skbs-safe-to-call-after-bnxt_free_mem.patch
@@ -0,0 +1,75 @@
+From 1affc01fdc6035189a5ab2a24948c9419ee0ecf2 Mon Sep 17 00:00:00 2001
+From: Edwin Peer <edwin.peer@broadcom.com>
+Date: Sun, 12 Sep 2021 12:34:48 -0400
+Subject: bnxt_en: make bnxt_free_skbs() safe to call after bnxt_free_mem()
+
+From: Edwin Peer <edwin.peer@broadcom.com>
+
+commit 1affc01fdc6035189a5ab2a24948c9419ee0ecf2 upstream.
+
+The call to bnxt_free_mem(..., false) in the bnxt_half_open_nic() error
+path will deallocate ring descriptor memory via bnxt_free_?x_rings(),
+but because irq_re_init is false, the ring info itself is not freed.
+
+To simplify error paths, deallocation functions have generally been
+written to be safe when called on unallocated memory. It should always
+be safe to call dev_close(), which calls bnxt_free_skbs() a second time,
+even in this semi-allocated ring state.
+
+Calling bnxt_free_skbs() a second time with the rings already freed will
+cause NULL pointer dereference.  Fix it by checking the rings are valid
+before proceeding in bnxt_free_tx_skbs() and
+bnxt_free_one_rx_ring_skbs().
+
+Fixes: 975bc99a4a39 ("bnxt_en: Refactor bnxt_free_rx_skbs().")
+Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -2680,6 +2680,9 @@ static void bnxt_free_tx_skbs(struct bnx
+               struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
+               int j;
+ 
++              if (!txr->tx_buf_ring)
++                      continue;
++
+               for (j = 0; j < max_idx;) {
+                       struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[j];
+                       struct sk_buff *skb;
+@@ -2764,6 +2767,9 @@ static void bnxt_free_one_rx_ring_skbs(s
+       }
+ 
+ skip_rx_tpa_free:
++      if (!rxr->rx_buf_ring)
++              goto skip_rx_buf_free;
++
+       for (i = 0; i < max_idx; i++) {
+               struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[i];
+               dma_addr_t mapping = rx_buf->mapping;
+@@ -2786,6 +2792,11 @@ skip_rx_tpa_free:
+                       kfree(data);
+               }
+       }
++
++skip_rx_buf_free:
++      if (!rxr->rx_agg_ring)
++              goto skip_rx_agg_free;
++
+       for (i = 0; i < max_agg_idx; i++) {
+               struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i];
+               struct page *page = rx_agg_buf->page;
+@@ -2802,6 +2813,8 @@ skip_rx_tpa_free:
+ 
+               __free_page(page);
+       }
++
++skip_rx_agg_free:
+       if (rxr->rx_page) {
+               __free_page(rxr->rx_page);
+               rxr->rx_page = NULL;
diff --git a/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mode.patch b/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mode.patch

new file mode 100644 (file)

index 0000000..877ff9f
--- /dev/null
+++ b/queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mode.patch
@@ -0,0 +1,385 @@
+From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Tue, 14 Sep 2021 01:07:57 +0200
+Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa upstream.
+
+Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
+Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
+embedded per-socket cgroup information into sock->sk_cgrp_data and in order
+to save 8 bytes in struct sock made both mutually exclusive, that is, when
+cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
+falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
+
+The assumption made was "there is no reason to mix the two and this is in line
+with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
+However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
+this assumption no longer holds, and the possibility of the v1/v2 mixed mode
+with the v2 root fallback being hit becomes a real security issue.
+
+Many of the cgroup v2 BPF programs are also used for policy enforcement, just
+to pick _one_ example, that is, to programmatically deny socket related system
+calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
+a policy bypass for the affected Pods.
+
+In production environments, we have recently seen this case due to various
+circumstances: i) a different 3rd party agent and/or ii) a container runtime
+such as [0] in the user's environment configuring legacy cgroup v1 net_cls
+tags, which triggered implicitly mentioned root fallback. Another case is
+Kubernetes projects like kind [1] which create Kubernetes nodes in a container
+and also add cgroup namespaces to the mix, meaning programs which are attached
+to the cgroup v2 root of the cgroup namespace get attached to a non-root
+cgroup v2 path from init namespace point of view. And the latter's root is
+out of reach for agents on a kind Kubernetes node to configure. Meaning, any
+entity on the node setting cgroup v1 net_cls tag will trigger the bypass
+despite cgroup v2 BPF programs attached to the namespace root.
+
+Generally, this mutual exclusiveness does not hold anymore in today's user
+environments and makes cgroup v2 usage from BPF side fragile and unreliable.
+This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
+sock_cgroup_data in order to address these issues; this implicitly also fixes
+the tradeoffs being made back then with regards to races and refcount leaks
+as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
+programs always operate as expected.
+
+  [0] https://github.com/nestybox/sysbox/
+  [1] https://kind.sigs.k8s.io/
+
+Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Stanislav Fomichev <sdf@google.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/cgroup-defs.h  |  107 ++++++++++---------------------------------
+ include/linux/cgroup.h       |   22 --------
+ kernel/cgroup/cgroup.c       |   50 ++++----------------
+ net/core/netclassid_cgroup.c |    7 --
+ net/core/netprio_cgroup.c    |   10 ----
+ 5 files changed, 41 insertions(+), 155 deletions(-)
+
+--- a/include/linux/cgroup-defs.h
++++ b/include/linux/cgroup-defs.h
+@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_ch
+  * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
+  * per-socket cgroup information except for memcg association.
+  *
+- * On legacy hierarchies, net_prio and net_cls controllers directly set
+- * attributes on each sock which can then be tested by the network layer.
+- * On the default hierarchy, each sock is associated with the cgroup it was
+- * created in and the networking layer can match the cgroup directly.
+- *
+- * To avoid carrying all three cgroup related fields separately in sock,
+- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
+- * On boot, sock_cgroup_data records the cgroup that the sock was created
+- * in so that cgroup2 matches can be made; however, once either net_prio or
+- * net_cls starts being used, the area is overridden to carry prioidx and/or
+- * classid.  The two modes are distinguished by whether the lowest bit is
+- * set.  Clear bit indicates cgroup pointer while set bit prioidx and
+- * classid.
+- *
+- * While userland may start using net_prio or net_cls at any time, once
+- * either is used, cgroup2 matching no longer works.  There is no reason to
+- * mix the two and this is in line with how legacy and v2 compatibility is
+- * handled.  On mode switch, cgroup references which are already being
+- * pointed to by socks may be leaked.  While this can be remedied by adding
+- * synchronization around sock_cgroup_data, given that the number of leaked
+- * cgroups is bound and highly unlikely to be high, this seems to be the
+- * better trade-off.
++ * On legacy hierarchies, net_prio and net_cls controllers directly
++ * set attributes on each sock which can then be tested by the network
++ * layer. On the default hierarchy, each sock is associated with the
++ * cgroup it was created in and the networking layer can match the
++ * cgroup directly.
+  */
+ struct sock_cgroup_data {
+-      union {
+-#ifdef __LITTLE_ENDIAN
+-              struct {
+-                      u8      is_data : 1;
+-                      u8      no_refcnt : 1;
+-                      u8      unused : 6;
+-                      u8      padding;
+-                      u16     prioidx;
+-                      u32     classid;
+-              } __packed;
+-#else
+-              struct {
+-                      u32     classid;
+-                      u16     prioidx;
+-                      u8      padding;
+-                      u8      unused : 6;
+-                      u8      no_refcnt : 1;
+-                      u8      is_data : 1;
+-              } __packed;
++      struct cgroup   *cgroup; /* v2 */
++#ifdef CONFIG_CGROUP_NET_CLASSID
++      u32             classid; /* v1 */
++#endif
++#ifdef CONFIG_CGROUP_NET_PRIO
++      u16             prioidx; /* v1 */
+ #endif
+-              u64             val;
+-      };
+ };
+ 
+-/*
+- * There's a theoretical window where the following accessors race with
+- * updaters and return part of the previous pointer as the prioidx or
+- * classid.  Such races are short-lived and the result isn't critical.
+- */
+ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
+ {
+-      /* fallback to 1 which is always the ID of the root cgroup */
+-      return (skcd->is_data & 1) ? skcd->prioidx : 1;
++#ifdef CONFIG_CGROUP_NET_PRIO
++      return READ_ONCE(skcd->prioidx);
++#else
++      return 1;
++#endif
+ }
+ 
+ static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
+ {
+-      /* fallback to 0 which is the unconfigured default classid */
+-      return (skcd->is_data & 1) ? skcd->classid : 0;
++#ifdef CONFIG_CGROUP_NET_CLASSID
++      return READ_ONCE(skcd->classid);
++#else
++      return 0;
++#endif
+ }
+ 
+-/*
+- * If invoked concurrently, the updaters may clobber each other.  The
+- * caller is responsible for synchronization.
+- */
+ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
+                                          u16 prioidx)
+ {
+-      struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+-
+-      if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
+-              return;
+-
+-      if (!(skcd_buf.is_data & 1)) {
+-              skcd_buf.val = 0;
+-              skcd_buf.is_data = 1;
+-      }
+-
+-      skcd_buf.prioidx = prioidx;
+-      WRITE_ONCE(skcd->val, skcd_buf.val);    /* see sock_cgroup_ptr() */
++#ifdef CONFIG_CGROUP_NET_PRIO
++      WRITE_ONCE(skcd->prioidx, prioidx);
++#endif
+ }
+ 
+ static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
+                                          u32 classid)
+ {
+-      struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+-
+-      if (sock_cgroup_classid(&skcd_buf) == classid)
+-              return;
+-
+-      if (!(skcd_buf.is_data & 1)) {
+-              skcd_buf.val = 0;
+-              skcd_buf.is_data = 1;
+-      }
+-
+-      skcd_buf.classid = classid;
+-      WRITE_ONCE(skcd->val, skcd_buf.val);    /* see sock_cgroup_ptr() */
++#ifdef CONFIG_CGROUP_NET_CLASSID
++      WRITE_ONCE(skcd->classid, classid);
++#endif
+ }
+ 
+ #else /* CONFIG_SOCK_CGROUP_DATA */
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -829,33 +829,13 @@ static inline void cgroup_account_cputim
+  */
+ #ifdef CONFIG_SOCK_CGROUP_DATA
+ 
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-extern spinlock_t cgroup_sk_update_lock;
+-#endif
+-
+-void cgroup_sk_alloc_disable(void);
+ void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
+ void cgroup_sk_clone(struct sock_cgroup_data *skcd);
+ void cgroup_sk_free(struct sock_cgroup_data *skcd);
+ 
+ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
+ {
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-      unsigned long v;
+-
+-      /*
+-       * @skcd->val is 64bit but the following is safe on 32bit too as we
+-       * just need the lower ulong to be written and read atomically.
+-       */
+-      v = READ_ONCE(skcd->val);
+-
+-      if (v & 3)
+-              return &cgrp_dfl_root.cgrp;
+-
+-      return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
+-#else
+-      return (struct cgroup *)(unsigned long)skcd->val;
+-#endif
++      return skcd->cgroup;
+ }
+ 
+ #else /* CONFIG_CGROUP_DATA */
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -6559,74 +6559,44 @@ int cgroup_parse_float(const char *input
+  */
+ #ifdef CONFIG_SOCK_CGROUP_DATA
+ 
+-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+-
+-DEFINE_SPINLOCK(cgroup_sk_update_lock);
+-static bool cgroup_sk_alloc_disabled __read_mostly;
+-
+-void cgroup_sk_alloc_disable(void)
+-{
+-      if (cgroup_sk_alloc_disabled)
+-              return;
+-      pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+-      cgroup_sk_alloc_disabled = true;
+-}
+-
+-#else
+-
+-#define cgroup_sk_alloc_disabled      false
+-
+-#endif
+-
+ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+ {
+-      if (cgroup_sk_alloc_disabled) {
+-              skcd->no_refcnt = 1;
+-              return;
+-      }
+-
+       /* Don't associate the sock with unrelated interrupted task's cgroup. */
+       if (in_interrupt())
+               return;
+ 
+       rcu_read_lock();
+-
+       while (true) {
+               struct css_set *cset;
+ 
+               cset = task_css_set(current);
+               if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+-                      skcd->val = (unsigned long)cset->dfl_cgrp;
++                      skcd->cgroup = cset->dfl_cgrp;
+                       cgroup_bpf_get(cset->dfl_cgrp);
+                       break;
+               }
+               cpu_relax();
+       }
+-
+       rcu_read_unlock();
+ }
+ 
+ void cgroup_sk_clone(struct sock_cgroup_data *skcd)
+ {
+-      if (skcd->val) {
+-              if (skcd->no_refcnt)
+-                      return;
+-              /*
+-               * We might be cloning a socket which is left in an empty
+-               * cgroup and the cgroup might have already been rmdir'd.
+-               * Don't use cgroup_get_live().
+-               */
+-              cgroup_get(sock_cgroup_ptr(skcd));
+-              cgroup_bpf_get(sock_cgroup_ptr(skcd));
+-      }
++      struct cgroup *cgrp = sock_cgroup_ptr(skcd);
++
++      /*
++       * We might be cloning a socket which is left in an empty
++       * cgroup and the cgroup might have already been rmdir'd.
++       * Don't use cgroup_get_live().
++       */
++      cgroup_get(cgrp);
++      cgroup_bpf_get(cgrp);
+ }
+ 
+ void cgroup_sk_free(struct sock_cgroup_data *skcd)
+ {
+       struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+ 
+-      if (skcd->no_refcnt)
+-              return;
+       cgroup_bpf_put(cgrp);
+       cgroup_put(cgrp);
+ }
+--- a/net/core/netclassid_cgroup.c
++++ b/net/core/netclassid_cgroup.c
+@@ -71,11 +71,8 @@ static int update_classid_sock(const voi
+       struct update_classid_context *ctx = (void *)v;
+       struct socket *sock = sock_from_file(file);
+ 
+-      if (sock) {
+-              spin_lock(&cgroup_sk_update_lock);
++      if (sock)
+               sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
+-              spin_unlock(&cgroup_sk_update_lock);
+-      }
+       if (--ctx->batch == 0) {
+               ctx->batch = UPDATE_CLASSID_BATCH;
+               return n + 1;
+@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_s
+       struct css_task_iter it;
+       struct task_struct *p;
+ 
+-      cgroup_sk_alloc_disable();
+-
+       cs->classid = (u32)value;
+ 
+       css_task_iter_start(css, 0, &it);
+--- a/net/core/netprio_cgroup.c
++++ b/net/core/netprio_cgroup.c
+@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kern
+       if (!dev)
+               return -ENODEV;
+ 
+-      cgroup_sk_alloc_disable();
+-
+       rtnl_lock();
+ 
+       ret = netprio_set_prio(of_css(of), dev, prio);
+@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kern
+ static int update_netprio(const void *v, struct file *file, unsigned n)
+ {
+       struct socket *sock = sock_from_file(file);
+-      if (sock) {
+-              spin_lock(&cgroup_sk_update_lock);
++
++      if (sock)
+               sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+                                       (unsigned long)v);
+-              spin_unlock(&cgroup_sk_update_lock);
+-      }
+       return 0;
+ }
+ 
+@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgrou
+       struct task_struct *p;
+       struct cgroup_subsys_state *css;
+ 
+-      cgroup_sk_alloc_disable();
+-
+       cgroup_taskset_for_each(p, css, tset) {
+               void *v = (void *)(unsigned long)css->id;
+ 
diff --git a/queue-5.14/dt-bindings-arm-fix-toradex-compatible-typo.patch b/queue-5.14/dt-bindings-arm-fix-toradex-compatible-typo.patch

new file mode 100644 (file)

index 0000000..03a57bb
--- /dev/null
+++ b/queue-5.14/dt-bindings-arm-fix-toradex-compatible-typo.patch
@@ -0,0 +1,31 @@
+From 55c21d57eafb7b379bb7b3e93baf9ca2695895b0 Mon Sep 17 00:00:00 2001
+From: David Heidelberg <david@ixit.cz>
+Date: Sun, 12 Sep 2021 18:51:20 +0200
+Subject: dt-bindings: arm: Fix Toradex compatible typo
+
+From: David Heidelberg <david@ixit.cz>
+
+commit 55c21d57eafb7b379bb7b3e93baf9ca2695895b0 upstream.
+
+Fix board compatible typo reported by dtbs_check.
+
+Fixes: f4d1577e9bc6 ("dt-bindings: arm: Convert Tegra board/soc bindings to json-schema")
+Signed-off-by: David Heidelberg <david@ixit.cz>
+Link: https://lore.kernel.org/r/20210912165120.188490-1-david@ixit.cz
+Signed-off-by: Rob Herring <robh@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/devicetree/bindings/arm/tegra.yaml |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Documentation/devicetree/bindings/arm/tegra.yaml
++++ b/Documentation/devicetree/bindings/arm/tegra.yaml
+@@ -54,7 +54,7 @@ properties:
+           - const: toradex,apalis_t30
+           - const: nvidia,tegra30
+       - items:
+-          - const: toradex,apalis_t30-eval-v1.1
++          - const: toradex,apalis_t30-v1.1-eval
+           - const: toradex,apalis_t30-eval
+           - const: toradex,apalis_t30-v1.1
+           - const: toradex,apalis_t30
diff --git a/queue-5.14/ibmvnic-check-failover_pending-in-login-response.patch b/queue-5.14/ibmvnic-check-failover_pending-in-login-response.patch

new file mode 100644 (file)

index 0000000..65e9997
--- /dev/null
+++ b/queue-5.14/ibmvnic-check-failover_pending-in-login-response.patch
@@ -0,0 +1,38 @@
+From 273c29e944bda9a20a30c26cfc34c9a3f363280b Mon Sep 17 00:00:00 2001
+From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+Date: Wed, 8 Sep 2021 09:58:20 -0700
+Subject: ibmvnic: check failover_pending in login response
+
+From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+
+commit 273c29e944bda9a20a30c26cfc34c9a3f363280b upstream.
+
+If a failover occurs before a login response is received, the login
+response buffer may be undefined. Check that there was no failover
+before accessing the login response buffer.
+
+Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol")
+Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -4700,6 +4700,14 @@ static int handle_login_rsp(union ibmvni
+               return 0;
+       }
+ 
++      if (adapter->failover_pending) {
++              adapter->init_done_rc = -EAGAIN;
++              netdev_dbg(netdev, "Failover pending, ignoring login response\n");
++              complete(&adapter->init_done);
++              /* login response buffer will be released on reset */
++              return 0;
++      }
++
+       netdev->mtu = adapter->req_mtu - ETH_HLEN;
+ 
+       netdev_dbg(adapter->netdev, "Login Response Buffer:\n");
diff --git a/queue-5.14/kvm-ppc-book3s-hv-tolerate-treclaim.-in-fake-suspend-mode-changing-registers.patch b/queue-5.14/kvm-ppc-book3s-hv-tolerate-treclaim.-in-fake-suspend-mode-changing-registers.patch

new file mode 100644 (file)

index 0000000..8be67b8
--- /dev/null
+++ b/queue-5.14/kvm-ppc-book3s-hv-tolerate-treclaim.-in-fake-suspend-mode-changing-registers.patch
@@ -0,0 +1,96 @@
+From 267cdfa21385d78c794768233678756e32b39ead Mon Sep 17 00:00:00 2001
+From: Nicholas Piggin <npiggin@gmail.com>
+Date: Wed, 8 Sep 2021 20:17:18 +1000
+Subject: KVM: PPC: Book3S HV: Tolerate treclaim. in fake-suspend mode changing registers
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+commit 267cdfa21385d78c794768233678756e32b39ead upstream.
+
+POWER9 DD2.2 and 2.3 hardware implements a "fake-suspend" mode where
+certain TM instructions executed in HV=0 mode cause softpatch interrupts
+so the hypervisor can emulate them and prevent problematic processor
+conditions. In this fake-suspend mode, the treclaim. instruction does
+not modify registers.
+
+Unfortunately the rfscv instruction executed by the guest does not
+generate softpatch interrupts, which can cause the hypervisor to lose
+track of the fake-suspend mode, and it can execute this treclaim. while
+not in fake-suspend mode. This modifies GPRs and crashes the hypervisor.
+
+It's not trivial to disable scv in the guest with HFSCR now, because
+they assume a POWER9 has scv available. So this fix saves and restores
+checkpointed registers across the treclaim.
+
+Fixes: 7854f7545bff ("KVM: PPC: Book3S: Rework TM save/restore code and make it C-callable")
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210908101718.118522-2-npiggin@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kvm/book3s_hv_rmhandlers.S |   36 ++++++++++++++++++++++++++++++--
+ 1 file changed, 34 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
++++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+@@ -2578,7 +2578,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_A
+       /* The following code handles the fake_suspend = 1 case */
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+-      stdu    r1, -PPC_MIN_STKFRM(r1)
++      stdu    r1, -TM_FRAME_SIZE(r1)
+ 
+       /* Turn on TM. */
+       mfmsr   r8
+@@ -2593,10 +2593,42 @@ BEGIN_FTR_SECTION
+ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
+       nop
+ 
++      /*
++       * It's possible that treclaim. may modify registers, if we have lost
++       * track of fake-suspend state in the guest due to it using rfscv.
++       * Save and restore registers in case this occurs.
++       */
++      mfspr   r3, SPRN_DSCR
++      mfspr   r4, SPRN_XER
++      mfspr   r5, SPRN_AMR
++      /* SPRN_TAR would need to be saved here if the kernel ever used it */
++      mfcr    r12
++      SAVE_NVGPRS(r1)
++      SAVE_GPR(2, r1)
++      SAVE_GPR(3, r1)
++      SAVE_GPR(4, r1)
++      SAVE_GPR(5, r1)
++      stw     r12, 8(r1)
++      std     r1, HSTATE_HOST_R1(r13)
++
+       /* We have to treclaim here because that's the only way to do S->N */
+       li      r3, TM_CAUSE_KVM_RESCHED
+       TRECLAIM(R3)
+ 
++      GET_PACA(r13)
++      ld      r1, HSTATE_HOST_R1(r13)
++      REST_GPR(2, r1)
++      REST_GPR(3, r1)
++      REST_GPR(4, r1)
++      REST_GPR(5, r1)
++      lwz     r12, 8(r1)
++      REST_NVGPRS(r1)
++      mtspr   SPRN_DSCR, r3
++      mtspr   SPRN_XER, r4
++      mtspr   SPRN_AMR, r5
++      mtcr    r12
++      HMT_MEDIUM
++
+       /*
+        * We were in fake suspend, so we are not going to save the
+        * register state as the guest checkpointed state (since
+@@ -2624,7 +2656,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_
+       std     r5, VCPU_TFHAR(r9)
+       std     r6, VCPU_TFIAR(r9)
+ 
+-      addi    r1, r1, PPC_MIN_STKFRM
++      addi    r1, r1, TM_FRAME_SIZE
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
diff --git a/queue-5.14/net-hns3-change-affinity_mask-to-numa-node-range.patch b/queue-5.14/net-hns3-change-affinity_mask-to-numa-node-range.patch

new file mode 100644 (file)

index 0000000..3802c76
--- /dev/null
+++ b/queue-5.14/net-hns3-change-affinity_mask-to-numa-node-range.patch
@@ -0,0 +1,55 @@
+From 1dc839ec09d3ab2a4156dc98328b8bc3586f2b70 Mon Sep 17 00:00:00 2001
+From: Yufeng Mo <moyufeng@huawei.com>
+Date: Mon, 13 Sep 2021 21:08:22 +0800
+Subject: net: hns3: change affinity_mask to numa node range
+
+From: Yufeng Mo <moyufeng@huawei.com>
+
+commit 1dc839ec09d3ab2a4156dc98328b8bc3586f2b70 upstream.
+
+Currently, affinity_mask is set to a single cpu. As a result,
+irqbalance becomes invalid in SUBSET or EXACT mode. To solve
+this problem, change affinity_mask to numa node range. In this
+way, irqbalance can be performed on the cpu of the numa node.
+
+Fixes: 0812545487ec ("net: hns3: add interrupt affinity support for misc interrupt")
+Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
+Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+@@ -1528,9 +1528,10 @@ static void hclge_init_kdump_kernel_conf
+ static int hclge_configure(struct hclge_dev *hdev)
+ {
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
++      const struct cpumask *cpumask = cpu_online_mask;
+       struct hclge_cfg cfg;
+       unsigned int i;
+-      int ret;
++      int node, ret;
+ 
+       ret = hclge_get_cfg(hdev, &cfg);
+       if (ret)
+@@ -1595,11 +1596,12 @@ static int hclge_configure(struct hclge_
+ 
+       hclge_init_kdump_kernel_config(hdev);
+ 
+-      /* Set the init affinity based on pci func number */
+-      i = cpumask_weight(cpumask_of_node(dev_to_node(&hdev->pdev->dev)));
+-      i = i ? PCI_FUNC(hdev->pdev->devfn) % i : 0;
+-      cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(&hdev->pdev->dev)),
+-                      &hdev->affinity_mask);
++      /* Set the affinity based on numa node */
++      node = dev_to_node(&hdev->pdev->dev);
++      if (node != NUMA_NO_NODE)
++              cpumask = cpumask_of_node(node);
++
++      cpumask_copy(&hdev->affinity_mask, cpumask);
+ 
+       return ret;
+ }
diff --git a/queue-5.14/net-hns3-disable-mac-in-flr-process.patch b/queue-5.14/net-hns3-disable-mac-in-flr-process.patch

new file mode 100644 (file)

index 0000000..decacff
--- /dev/null
+++ b/queue-5.14/net-hns3-disable-mac-in-flr-process.patch
@@ -0,0 +1,39 @@
+From b81d8948746520f989e86d66292ff72b5056114a Mon Sep 17 00:00:00 2001
+From: Yufeng Mo <moyufeng@huawei.com>
+Date: Mon, 13 Sep 2021 21:08:23 +0800
+Subject: net: hns3: disable mac in flr process
+
+From: Yufeng Mo <moyufeng@huawei.com>
+
+commit b81d8948746520f989e86d66292ff72b5056114a upstream.
+
+The firmware will not disable mac in flr process. Therefore, the driver
+needs to proactively disable mac during flr, which is the same as the
+function reset.
+
+Fixes: 35d93a30040c ("net: hns3: adjust the process of PF reset")
+Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
+Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+@@ -8120,11 +8120,12 @@ static void hclge_ae_stop(struct hnae3_h
+       hclge_clear_arfs_rules(hdev);
+       spin_unlock_bh(&hdev->fd_rule_lock);
+ 
+-      /* If it is not PF reset, the firmware will disable the MAC,
++      /* If it is not PF reset or FLR, the firmware will disable the MAC,
+        * so it only need to stop phy here.
+        */
+       if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) &&
+-          hdev->reset_type != HNAE3_FUNC_RESET) {
++          hdev->reset_type != HNAE3_FUNC_RESET &&
++          hdev->reset_type != HNAE3_FLR_RESET) {
+               hclge_mac_stop_phy(hdev);
+               hclge_update_link_status(hdev);
+               return;
diff --git a/queue-5.14/net-hns3-fix-the-timing-issue-of-vf-clearing-interrupt-sources.patch b/queue-5.14/net-hns3-fix-the-timing-issue-of-vf-clearing-interrupt-sources.patch

new file mode 100644 (file)

index 0000000..91872c0
--- /dev/null
+++ b/queue-5.14/net-hns3-fix-the-timing-issue-of-vf-clearing-interrupt-sources.patch
@@ -0,0 +1,56 @@
+From 427900d27d86b820c559037a984bd403f910860f Mon Sep 17 00:00:00 2001
+From: Jiaran Zhang <zhangjiaran@huawei.com>
+Date: Mon, 13 Sep 2021 21:08:25 +0800
+Subject: net: hns3: fix the timing issue of VF clearing interrupt sources
+
+From: Jiaran Zhang <zhangjiaran@huawei.com>
+
+commit 427900d27d86b820c559037a984bd403f910860f upstream.
+
+Currently, the VF does not clear the interrupt source immediately after
+receiving the interrupt. As a result, if the second interrupt task is
+triggered when processing the first interrupt task, clearing the
+interrupt source before exiting will clear the interrupt sources of the
+two tasks at the same time. As a result, no interrupt is triggered for
+the second task. The VF detects the missed message only when the next
+interrupt is generated.
+
+Clearing it immediately after executing check_evt_cause ensures that:
+1. Even if two interrupt tasks are triggered at the same time, they can
+be processed.
+2. If the second task is triggered during the processing of the first
+task and the interrupt source is not cleared, the interrupt is reported
+after vector0 is enabled.
+
+Fixes: b90fcc5bd904 ("net: hns3: add reset handling for VF when doing Core/Global/IMP reset")
+Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
+Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+@@ -2463,6 +2463,8 @@ static irqreturn_t hclgevf_misc_irq_hand
+ 
+       hclgevf_enable_vector(&hdev->misc_vector, false);
+       event_cause = hclgevf_check_evt_cause(hdev, &clearval);
++      if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER)
++              hclgevf_clear_event_cause(hdev, clearval);
+ 
+       switch (event_cause) {
+       case HCLGEVF_VECTOR0_EVENT_RST:
+@@ -2475,10 +2477,8 @@ static irqreturn_t hclgevf_misc_irq_hand
+               break;
+       }
+ 
+-      if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) {
+-              hclgevf_clear_event_cause(hdev, clearval);
++      if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER)
+               hclgevf_enable_vector(&hdev->misc_vector, true);
+-      }
+ 
+       return IRQ_HANDLED;
+ }
diff --git a/queue-5.14/net-hns3-pad-the-short-tunnel-frame-before-sending-to-hardware.patch b/queue-5.14/net-hns3-pad-the-short-tunnel-frame-before-sending-to-hardware.patch

new file mode 100644 (file)

index 0000000..bb41021
--- /dev/null
+++ b/queue-5.14/net-hns3-pad-the-short-tunnel-frame-before-sending-to-hardware.patch
@@ -0,0 +1,46 @@
+From d18e81183b1cb9c309266cbbce9acd3e0c528d04 Mon Sep 17 00:00:00 2001
+From: Yufeng Mo <moyufeng@huawei.com>
+Date: Mon, 13 Sep 2021 21:08:21 +0800
+Subject: net: hns3: pad the short tunnel frame before sending to hardware
+
+From: Yufeng Mo <moyufeng@huawei.com>
+
+commit d18e81183b1cb9c309266cbbce9acd3e0c528d04 upstream.
+
+The hardware cannot handle tunnel frames shorter than 65 bytes;
+sending one causes the VLAN tag to go missing. So pad tunnel frames
+to 65 bytes before sending them to fix this bug.
+
+Fixes: 3db084d28dc0 ("net: hns3: Fix for vxlan tx checksum bug")
+Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
+Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+@@ -73,6 +73,7 @@ MODULE_PARM_DESC(tx_sgl, "Minimum number
+ #define HNS3_OUTER_VLAN_TAG   2
+ 
+ #define HNS3_MIN_TX_LEN               33U
++#define HNS3_MIN_TUN_PKT_LEN  65U
+ 
+ /* hns3_pci_tbl - PCI Device ID Table
+  *
+@@ -1425,8 +1426,11 @@ static int hns3_set_l2l3l4(struct sk_buf
+                              l4.tcp->doff);
+               break;
+       case IPPROTO_UDP:
+-              if (hns3_tunnel_csum_bug(skb))
+-                      return skb_checksum_help(skb);
++              if (hns3_tunnel_csum_bug(skb)) {
++                      int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN);
++
++                      return ret ? ret : skb_checksum_help(skb);
++              }
+ 
+               hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1);
+               hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S,
diff --git a/queue-5.14/powerpc-64s-system-call-rfscv-workaround-for-tm-bugs.patch b/queue-5.14/powerpc-64s-system-call-rfscv-workaround-for-tm-bugs.patch

new file mode 100644 (file)

index 0000000..666d2b6
--- /dev/null
+++ b/queue-5.14/powerpc-64s-system-call-rfscv-workaround-for-tm-bugs.patch
@@ -0,0 +1,51 @@
+From ae7aaecc3f2f78b76ab3a8d6178610f55aadfa56 Mon Sep 17 00:00:00 2001
+From: Nicholas Piggin <npiggin@gmail.com>
+Date: Wed, 8 Sep 2021 20:17:17 +1000
+Subject: powerpc/64s: system call rfscv workaround for TM bugs
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+commit ae7aaecc3f2f78b76ab3a8d6178610f55aadfa56 upstream.
+
+The rfscv instruction does not work correctly with the fake-suspend mode
+in POWER9, which can end up with the hypervisor restoring an incorrect
+checkpoint.
+
+Work around this by setting the _TIF_RESTOREALL flag if a system call
+returns to a transaction active state, causing rfid to be used instead
+of rfscv to return, which will do the right thing. The contents of the
+registers are irrelevant because they will be overwritten in this case
+anyway.
+
+Fixes: 7fa95f9adaee7 ("powerpc/64s: system call support for scv/rfscv instructions")
+Reported-by: Eirik Fuller <efuller@redhat.com>
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210908101718.118522-1-npiggin@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/interrupt.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/arch/powerpc/kernel/interrupt.c
++++ b/arch/powerpc/kernel/interrupt.c
+@@ -140,6 +140,19 @@ notrace long system_call_exception(long
+       irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
+ 
+       /*
++       * If system call is called with TM active, set _TIF_RESTOREALL to
++       * prevent RFSCV being used to return to userspace, because POWER9
++       * TM implementation has problems with this instruction returning to
++       * transactional state. Final register values are not relevant because
++       * the transaction will be aborted upon return anyway. Or in the case
++       * of unsupported_scv SIGILL fault, the return state does not much
++       * matter because it's an edge case.
++       */
++      if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
++                      unlikely(MSR_TM_TRANSACTIONAL(regs->msr)))
++              current_thread_info()->flags |= _TIF_RESTOREALL;
++
++      /*
+        * If the system call was made with a transaction active, doom it and
+        * return without performing the system call. Unless it was an
+        * unsupported scv vector, in which case it's treated like an illegal
diff --git a/queue-5.14/powerpc-mce-fix-access-error-in-mce-handler.patch b/queue-5.14/powerpc-mce-fix-access-error-in-mce-handler.patch

new file mode 100644 (file)

index 0000000..259bd7e
--- /dev/null
+++ b/queue-5.14/powerpc-mce-fix-access-error-in-mce-handler.patch
@@ -0,0 +1,84 @@
+From 3a1e92d0896e928ac2a5b58962d05a39afef2e23 Mon Sep 17 00:00:00 2001
+From: Ganesh Goudar <ganeshgr@linux.ibm.com>
+Date: Thu, 9 Sep 2021 12:13:30 +0530
+Subject: powerpc/mce: Fix access error in mce handler
+
+From: Ganesh Goudar <ganeshgr@linux.ibm.com>
+
+commit 3a1e92d0896e928ac2a5b58962d05a39afef2e23 upstream.
+
+We queue an irq work for deferred processing of mce event in realmode
+mce handler, where translation is disabled. Queuing of the work may
+result in accessing memory outside RMO region, such access needs the
+translation to be enabled for an LPAR running with hash mmu else the
+kernel crashes.
+
+After enabling translation in mce_handle_error() we used to leave it
+enabled to avoid crashing here, but now with the commit
+74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from
+handler") we are restoring the MSR to disable translation.
+
+Hence to fix this enable the translation before queuing the work.
+
+Without this change following trace is seen on injecting SLB multihit in
+an LPAR running with hash mmu.
+
+  Oops: Kernel access of bad area, sig: 11 [#1]
+  LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+  CPU: 5 PID: 1883 Comm: insmod Tainted: G        OE     5.14.0-mce+ #137
+  NIP:  c000000000735d60 LR: c000000000318640 CTR: 0000000000000000
+  REGS: c00000001ebff9a0 TRAP: 0300   Tainted: G       OE      (5.14.0-mce+)
+  MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 28008228  XER: 00000001
+  CFAR: c00000000031863c DAR: c00000027fa8fe08 DSISR: 40000000 IRQMASK: 0
+  ...
+  NIP llist_add_batch+0x0/0x40
+  LR  __irq_work_queue_local+0x70/0xc0
+  Call Trace:
+    0xc00000001ebffc0c (unreliable)
+    irq_work_queue+0x40/0x70
+    machine_check_queue_event+0xbc/0xd0
+    machine_check_early_common+0x16c/0x1f4
+
+Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler")
+Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
+[mpe: Fix comment formatting, trim oops in change log for readability]
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210909064330.312432-1-ganeshgr@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/mce.c |   17 +++++++++++++++--
+ 1 file changed, 15 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kernel/mce.c
++++ b/arch/powerpc/kernel/mce.c
+@@ -249,6 +249,7 @@ void machine_check_queue_event(void)
+ {
+       int index;
+       struct machine_check_event evt;
++      unsigned long msr;
+ 
+       if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+               return;
+@@ -262,8 +263,20 @@ void machine_check_queue_event(void)
+       memcpy(&local_paca->mce_info->mce_event_queue[index],
+              &evt, sizeof(evt));
+ 
+-      /* Queue irq work to process this event later. */
+-      irq_work_queue(&mce_event_process_work);
++      /*
++       * Queue irq work to process this event later. Before
++       * queuing the work enable translation for non radix LPAR,
++       * as irq_work_queue may try to access memory outside RMO
++       * region.
++       */
++      if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
++              msr = mfmsr();
++              mtmsr(msr | MSR_IR | MSR_DR);
++              irq_work_queue(&mce_event_process_work);
++              mtmsr(msr);
++      } else {
++              irq_work_queue(&mce_event_process_work);
++      }
+ }
+ 
+ void mce_common_process_ue(struct pt_regs *regs,
diff --git a/queue-5.14/qed-handle-management-fw-error.patch b/queue-5.14/qed-handle-management-fw-error.patch

new file mode 100644 (file)

index 0000000..9fed778
--- /dev/null
+++ b/queue-5.14/qed-handle-management-fw-error.patch
@@ -0,0 +1,46 @@
+From 20e100f52730cd0db609e559799c1712b5f27582 Mon Sep 17 00:00:00 2001
+From: Shai Malin <smalin@marvell.com>
+Date: Fri, 10 Sep 2021 11:33:56 +0300
+Subject: qed: Handle management FW error
+
+From: Shai Malin <smalin@marvell.com>
+
+commit 20e100f52730cd0db609e559799c1712b5f27582 upstream.
+
+Handle MFW (management FW) error response in order to avoid a crash
+during recovery flows.
+
+Changes from v1:
+- Add "Fixes tag".
+
+Fixes: 5e7ba042fd05 ("qed: Fix reading stale configuration information")
+Signed-off-by: Ariel Elior <aelior@marvell.com>
+Signed-off-by: Shai Malin <smalin@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/qlogic/qed/qed_mcp.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
++++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+@@ -3368,6 +3368,7 @@ qed_mcp_get_nvm_image_att(struct qed_hwf
+                         struct qed_nvm_image_att *p_image_att)
+ {
+       enum nvm_image_type type;
++      int rc;
+       u32 i;
+ 
+       /* Translate image_id into MFW definitions */
+@@ -3396,7 +3397,10 @@ qed_mcp_get_nvm_image_att(struct qed_hwf
+               return -EINVAL;
+       }
+ 
+-      qed_mcp_nvm_info_populate(p_hwfn);
++      rc = qed_mcp_nvm_info_populate(p_hwfn);
++      if (rc)
++              return rc;
++
+       for (i = 0; i < p_hwfn->nvm_info.num_images; i++)
+               if (type == p_hwfn->nvm_info.image_att[i].image_type)
+                       break;
diff --git a/queue-5.14/s390-pci_mmio-fully-validate-the-vma-before-calling-follow_pte.patch b/queue-5.14/s390-pci_mmio-fully-validate-the-vma-before-calling-follow_pte.patch

new file mode 100644 (file)

index 0000000..d1c7f9f
--- /dev/null
+++ b/queue-5.14/s390-pci_mmio-fully-validate-the-vma-before-calling-follow_pte.patch
@@ -0,0 +1,48 @@
+From a8b92b8c1eac8d655a97b1e90f4d83c25d9b9a18 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 9 Sep 2021 16:59:42 +0200
+Subject: s390/pci_mmio: fully validate the VMA before calling follow_pte()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit a8b92b8c1eac8d655a97b1e90f4d83c25d9b9a18 upstream.
+
+We should not walk/touch page tables outside of VMA boundaries when
+holding only the mmap sem in read mode. Evil user space can modify the
+VMA layout just before this function runs and e.g., trigger races with
+page table removal code since commit dd2283f2605e ("mm: mmap: zap pages
+with read mmap_sem in munmap").
+
+find_vma() does not check if the address is >= the VMA start address;
+use vma_lookup() instead.
+
+Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
+Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/pci/pci_mmio.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/s390/pci/pci_mmio.c
++++ b/arch/s390/pci/pci_mmio.c
+@@ -159,7 +159,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, uns
+ 
+       mmap_read_lock(current->mm);
+       ret = -EINVAL;
+-      vma = find_vma(current->mm, mmio_addr);
++      vma = vma_lookup(current->mm, mmio_addr);
+       if (!vma)
+               goto out_unlock_mmap;
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+@@ -298,7 +298,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsi
+ 
+       mmap_read_lock(current->mm);
+       ret = -EINVAL;
+-      vma = find_vma(current->mm, mmio_addr);
++      vma = vma_lookup(current->mm, mmio_addr);
+       if (!vma)
+               goto out_unlock_mmap;
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
diff --git a/queue-5.14/selftest-net-fix-typo-in-altname-test.patch b/queue-5.14/selftest-net-fix-typo-in-altname-test.patch

new file mode 100644 (file)

index 0000000..bcfc165
--- /dev/null
+++ b/queue-5.14/selftest-net-fix-typo-in-altname-test.patch
@@ -0,0 +1,34 @@
+From 1b704b27beb11ce147d64b21c914e57afbfb5656 Mon Sep 17 00:00:00 2001
+From: Andrea Claudi <aclaudi@redhat.com>
+Date: Sat, 11 Sep 2021 16:14:18 +0200
+Subject: selftest: net: fix typo in altname test
+
+From: Andrea Claudi <aclaudi@redhat.com>
+
+commit 1b704b27beb11ce147d64b21c914e57afbfb5656 upstream.
+
+If altname deletion of the short alternative name fails, the error
+message printed is: "Failed to add short alternative name".
+This is obviously a typo, as we are testing altname deletion.
+
+Fix this using a proper error message.
+
+Fixes: f95e6c9c4617 ("selftest: net: add alternative names test")
+Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/net/altnames.sh |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/net/altnames.sh
++++ b/tools/testing/selftests/net/altnames.sh
+@@ -45,7 +45,7 @@ altnames_test()
+       check_err $? "Got unexpected long alternative name from link show JSON"
+ 
+       ip link property del $DUMMY_DEV altname $SHORT_NAME
+-      check_err $? "Failed to add short alternative name"
++      check_err $? "Failed to delete short alternative name"
+ 
+       ip -j -p link show $SHORT_NAME &>/dev/null
+       check_fail $? "Unexpected success while trying to do link show with deleted short alternative name"
diff --git a/queue-5.14/series b/queue-5.14/series

index e2b2be058dbd7f611601aed29f0d2aa709e482fe..93dc73d2049238e32562ad73f710578d2a791425 100644 (file)
--- a/queue-5.14/series
+++ b/queue-5.14/series
@@ -62,3 +62,20 @@ vhost_net-fix-oob-on-sendmsg-failure.patch
  net-af_unix-fix-a-data-race-in-unix_dgram_poll.patch
  net-dsa-destroy-the-phylink-instance-on-any-error-in-dsa_slave_phy_setup.patch
  revert-ipv4-fix-memory-leaks-in-ip_cmsg_send-callers.patch
+x86-uaccess-fix-32-bit-__get_user_asm_u64-when-cc_has_asm_goto_output-y.patch
+bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mode.patch
+tcp-fix-tp-undo_retrans-accounting-in-tcp_sacktag_one.patch
+selftest-net-fix-typo-in-altname-test.patch
+qed-handle-management-fw-error.patch
+udp_tunnel-fix-udp_tunnel_nic-work-queue-type.patch
+dt-bindings-arm-fix-toradex-compatible-typo.patch
+ibmvnic-check-failover_pending-in-login-response.patch
+kvm-ppc-book3s-hv-tolerate-treclaim.-in-fake-suspend-mode-changing-registers.patch
+powerpc-64s-system-call-rfscv-workaround-for-tm-bugs.patch
+powerpc-mce-fix-access-error-in-mce-handler.patch
+s390-pci_mmio-fully-validate-the-vma-before-calling-follow_pte.patch
+bnxt_en-make-bnxt_free_skbs-safe-to-call-after-bnxt_free_mem.patch
+net-hns3-pad-the-short-tunnel-frame-before-sending-to-hardware.patch
+net-hns3-change-affinity_mask-to-numa-node-range.patch
+net-hns3-disable-mac-in-flr-process.patch
+net-hns3-fix-the-timing-issue-of-vf-clearing-interrupt-sources.patch
diff --git a/queue-5.14/tcp-fix-tp-undo_retrans-accounting-in-tcp_sacktag_one.patch b/queue-5.14/tcp-fix-tp-undo_retrans-accounting-in-tcp_sacktag_one.patch

new file mode 100644 (file)

index 0000000..bc5899f
--- /dev/null
+++ b/queue-5.14/tcp-fix-tp-undo_retrans-accounting-in-tcp_sacktag_one.patch
@@ -0,0 +1,42 @@
+From 4f884f3962767877d7aabbc1ec124d2c307a4257 Mon Sep 17 00:00:00 2001
+From: zhenggy <zhenggy@chinatelecom.cn>
+Date: Tue, 14 Sep 2021 09:51:15 +0800
+Subject: tcp: fix tp->undo_retrans accounting in tcp_sacktag_one()
+
+From: zhenggy <zhenggy@chinatelecom.cn>
+
+commit 4f884f3962767877d7aabbc1ec124d2c307a4257 upstream.
+
+Commit 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit
+time") may directly retransmit a multi-segment TSO/GSO packet without
+splitting it. Since that commit, we can no longer assume that a
+retransmitted packet is a single segment.
+
+This patch fixes the tp->undo_retrans accounting in tcp_sacktag_one()
+to use the actual segment count (pcount) of the retransmitted packet.
+
+Before that commit (10d3be569243), the assumption underlying the
+tp->undo_retrans-- seems correct.
+
+Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time")
+Signed-off-by: zhenggy <zhenggy@chinatelecom.cn>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1314,7 +1314,7 @@ static u8 tcp_sacktag_one(struct sock *s
+       if (dup_sack && (sacked & TCPCB_RETRANS)) {
+               if (tp->undo_marker && tp->undo_retrans > 0 &&
+                   after(end_seq, tp->undo_marker))
+-                      tp->undo_retrans--;
++                      tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
+               if ((sacked & TCPCB_SACKED_ACKED) &&
+                   before(start_seq, state->reord))
+                               state->reord = start_seq;
diff --git a/queue-5.14/udp_tunnel-fix-udp_tunnel_nic-work-queue-type.patch b/queue-5.14/udp_tunnel-fix-udp_tunnel_nic-work-queue-type.patch

new file mode 100644 (file)

index 0000000..03abc11
--- /dev/null
+++ b/queue-5.14/udp_tunnel-fix-udp_tunnel_nic-work-queue-type.patch
@@ -0,0 +1,64 @@
+From e50e711351bdc656a8e6ca1022b4293cae8dcd59 Mon Sep 17 00:00:00 2001
+From: Aya Levin <ayal@nvidia.com>
+Date: Mon, 13 Sep 2021 10:53:49 +0300
+Subject: udp_tunnel: Fix udp_tunnel_nic work-queue type
+
+From: Aya Levin <ayal@nvidia.com>
+
+commit e50e711351bdc656a8e6ca1022b4293cae8dcd59 upstream.
+
+Turn udp_tunnel_nic work-queue to an ordered work-queue. This queue
+holds the UDP-tunnel configuration commands of the different netdevs.
+When the netdevs are functions of the same NIC the order of
+execution may be crucial.
+
+Problem example:
+NIC with 2 PFs, both PFs declare offload quota of up to 3 UDP-ports.
+ $ifconfig eth2 1.1.1.1/16 up
+
+ $ip link add eth2_19503 type vxlan id 5049 remote 1.1.1.2 dev eth2 dstport 19053
+ $ip link set dev eth2_19503 up
+
+ $ip link add eth2_19504 type vxlan id 5049 remote 1.1.1.3 dev eth2 dstport 19054
+ $ip link set dev eth2_19504 up
+
+ $ip link add eth2_19505 type vxlan id 5049 remote 1.1.1.4 dev eth2 dstport 19055
+ $ip link set dev eth2_19505 up
+
+ $ip link add eth2_19506 type vxlan id 5049 remote 1.1.1.5 dev eth2 dstport 19056
+ $ip link set dev eth2_19506 up
+
+NIC RX port offload infrastructure offloads the first 3 UDP-ports (on
+all devices which set the NETIF_F_RX_UDP_TUNNEL_PORT feature) and not
+UDP-port 19056. So both PFs get this offload configuration.
+
+ $ip link set dev eth2_19504 down
+
+This triggers udp-tunnel-core to remove UDP-port 19054 from the
+offload-ports-list and offload UDP-port 19056 instead.
+
+In this scenario it is important that UDP-port 19054 is removed
+from both PFs before trying to add UDP-port 19056. The NIC can
+stop offloading a UDP-port only when all references are removed.
+Otherwise the NIC may report that the offload quota is exceeded.
+
+Fixes: cc4e3835eff4 ("udp_tunnel: add central NIC RX port offload infrastructure")
+Signed-off-by: Aya Levin <ayal@nvidia.com>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/udp_tunnel_nic.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/udp_tunnel_nic.c
++++ b/net/ipv4/udp_tunnel_nic.c
+@@ -935,7 +935,7 @@ static int __init udp_tunnel_nic_init_mo
+ {
+       int err;
+ 
+-      udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0);
++      udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0);
+       if (!udp_tunnel_nic_workqueue)
+               return -ENOMEM;
+ 
diff --git a/queue-5.14/x86-uaccess-fix-32-bit-__get_user_asm_u64-when-cc_has_asm_goto_output-y.patch b/queue-5.14/x86-uaccess-fix-32-bit-__get_user_asm_u64-when-cc_has_asm_goto_output-y.patch

new file mode 100644 (file)

index 0000000..f188d74
--- /dev/null
+++ b/queue-5.14/x86-uaccess-fix-32-bit-__get_user_asm_u64-when-cc_has_asm_goto_output-y.patch
@@ -0,0 +1,59 @@
+From a69ae291e1cc2d08ae77c2029579c59c9bde5061 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will@kernel.org>
+Date: Mon, 13 Sep 2021 17:35:47 +0100
+Subject: x86/uaccess: Fix 32-bit __get_user_asm_u64() when CC_HAS_ASM_GOTO_OUTPUT=y
+
+From: Will Deacon <will@kernel.org>
+
+commit a69ae291e1cc2d08ae77c2029579c59c9bde5061 upstream.
+
+Commit 865c50e1d279 ("x86/uaccess: utilize CONFIG_CC_HAS_ASM_GOTO_OUTPUT")
+added an optimised version of __get_user_asm() for x86 using 'asm goto'.
+
+Like the non-optimised code, the 32-bit implementation of 64-bit
+get_user() expands to a pair of 32-bit accesses.  Unlike the
+non-optimised code, the _original_ pointer is incremented to copy the
+high word instead of loading through a new pointer explicitly
+constructed to point at a 32-bit type.  Consequently, if the pointer
+points at a 64-bit type then we end up loading the wrong data for the
+upper 32-bits.
+
+This was observed as a mount() failure in Android targeting i686 after
+b0cfcdd9b967 ("d_path: make 'prepend()' fill up the buffer exactly on
+overflow") because the call to copy_from_kernel_nofault() from
+prepend_copy() ends up in __get_kernel_nofault() and casts the source
+pointer to a 'u64 __user *'.  An attempt to mount at "/debug_ramdisk"
+therefore ends up failing trying to mount "/debumdismdisk".
+
+Use the existing '__gu_ptr' source pointer to unsigned int for 32-bit
+__get_user_asm_u64() instead of the original pointer.
+
+Cc: Bill Wendling <morbo@google.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Fixes: 865c50e1d279 ("x86/uaccess: utilize CONFIG_CC_HAS_ASM_GOTO_OUTPUT")
+Signed-off-by: Will Deacon <will@kernel.org>
+Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
+Tested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/uaccess.h |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/uaccess.h
++++ b/arch/x86/include/asm/uaccess.h
+@@ -301,8 +301,8 @@ do {                                                                       \
+       unsigned int __gu_low, __gu_high;                               \
+       const unsigned int __user *__gu_ptr;                            \
+       __gu_ptr = (const void __user *)(ptr);                          \
+-      __get_user_asm(__gu_low, ptr, "l", "=r", label);                \
+-      __get_user_asm(__gu_high, ptr+1, "l", "=r", label);             \
++      __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);           \
++      __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
+       (x) = ((unsigned long long)__gu_high << 32) | __gu_low;         \
+ } while (0)
+ #else
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 20 Sep 2021 09:01:38 +0000 (11:01 +0200)
queue-5.14/bnxt_en-make-bnxt_free_skbs-safe-to-call-after-bnxt_free_mem.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/bpf-cgroups-fix-cgroup-v2-fallback-on-v1-v2-mixed-mode.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/dt-bindings-arm-fix-toradex-compatible-typo.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/ibmvnic-check-failover_pending-in-login-response.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/kvm-ppc-book3s-hv-tolerate-treclaim.-in-fake-suspend-mode-changing-registers.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/net-hns3-change-affinity_mask-to-numa-node-range.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/net-hns3-disable-mac-in-flr-process.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/net-hns3-fix-the-timing-issue-of-vf-clearing-interrupt-sources.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/net-hns3-pad-the-short-tunnel-frame-before-sending-to-hardware.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/powerpc-64s-system-call-rfscv-workaround-for-tm-bugs.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/powerpc-mce-fix-access-error-in-mce-handler.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/qed-handle-management-fw-error.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/s390-pci_mmio-fully-validate-the-vma-before-calling-follow_pte.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/selftest-net-fix-typo-in-altname-test.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/series		patch \| blob \| blame \| history
queue-5.14/tcp-fix-tp-undo_retrans-accounting-in-tcp_sacktag_one.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/udp_tunnel-fix-udp_tunnel_nic-work-queue-type.patch	[new file with mode: 0644]	patch \| blob
queue-5.14/x86-uaccess-fix-32-bit-__get_user_asm_u64-when-cc_has_asm_goto_output-y.patch	[new file with mode: 0644]	patch \| blob