git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
fixes for 5.4
author Sasha Levin <sashal@kernel.org>
Mon, 3 Feb 2020 03:01:12 +0000 (22:01 -0500)
committer Sasha Levin <sashal@kernel.org>
Mon, 3 Feb 2020 03:01:12 +0000 (22:01 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch [new file with mode: 0644]
queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch [new file with mode: 0644]
queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch [new file with mode: 0644]
queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch [new file with mode: 0644]

diff --git a/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch b/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch
new file mode 100644
index 0000000..11c0fd4
--- /dev/null
+++ b/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch
@@ -0,0 +1,55 @@
+From 613b8b49ab902e1c338c78698e5d7810ac11be65 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 11 Oct 2019 08:34:59 -0700
+Subject: e1000e: Drop unnecessary __E1000_DOWN bit twiddling
+
+From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
+
+[ Upstream commit daee5598e491d8d3979bd4ad6c447d89ce57b446 ]
+
+Since we no longer check for __E1000_DOWN in e1000e_close we can drop the
+spot where we were restoring the bit. This saves us a bit of unnecessary
+complexity.
+
+Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Tested-by: Aaron Brown <aaron.f.brown@intel.com>
+Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/e1000e/netdev.c | 7 +------
+ 1 file changed, 1 insertion(+), 6 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
+index d7d56e42a6aac..aa9fdda839148 100644
+--- a/drivers/net/ethernet/intel/e1000e/netdev.c
++++ b/drivers/net/ethernet/intel/e1000e/netdev.c
+@@ -7407,15 +7407,13 @@ static void e1000_remove(struct pci_dev *pdev)
+ {
+       struct net_device *netdev = pci_get_drvdata(pdev);
+       struct e1000_adapter *adapter = netdev_priv(netdev);
+-      bool down = test_bit(__E1000_DOWN, &adapter->state);
+       e1000e_ptp_remove(adapter);
+       /* The timers may be rescheduled, so explicitly disable them
+        * from being rescheduled.
+        */
+-      if (!down)
+-              set_bit(__E1000_DOWN, &adapter->state);
++      set_bit(__E1000_DOWN, &adapter->state);
+       del_timer_sync(&adapter->phy_info_timer);
+       cancel_work_sync(&adapter->reset_task);
+@@ -7435,9 +7433,6 @@ static void e1000_remove(struct pci_dev *pdev)
+               }
+       }
+-      /* Don't lie to e1000_close() down the road. */
+-      if (!down)
+-              clear_bit(__E1000_DOWN, &adapter->state);
+       unregister_netdev(netdev);
+       if (pci_dev_run_wake(pdev))
+-- 
+2.20.1
+
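To see why dropping the conditional is safe, here is a tiny standalone C illustration (hedged: DEMO_DOWN and the program itself are hypothetical, not driver code). Setting a state bit that is already set changes nothing, and with the __E1000_DOWN check gone from e1000e_close() there is no later reader on whose behalf the old value would need to be restored.

/* Standalone sketch (hypothetical names): marking "down" is idempotent. */
#include <assert.h>

#define DEMO_DOWN (1u << 2)           /* stand-in for the __E1000_DOWN flag */

int main(void)
{
        unsigned int state = 0;

        state |= DEMO_DOWN;           /* remove path sets the bit ...       */
        state |= DEMO_DOWN;           /* ... setting it again is a no-op,   */
        assert(state & DEMO_DOWN);    /* so no save/restore dance is needed */
        return 0;
}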
diff --git a/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch b/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch
new file mode 100644
index 0000000..f246517
--- /dev/null
+++ b/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch
@@ -0,0 +1,180 @@
+From d84eb43b7150df147cba96b52fb9952232ef4477 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 4 Jan 2020 23:29:22 -0800
+Subject: e1000e: Revert "e1000e: Make watchdog use delayed work"
+
+From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
+
+[ Upstream commit d5ad7a6a7f3c87b278d7e4973b65682be4e588dd ]
+
+This reverts commit 59653e6497d16f7ac1d9db088f3959f57ee8c3db.
+
+The commit being reverted caused driver crashes and made connections
+reset unexpectedly.
+
+Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
+Tested-by: Aaron Brown <aaron.f.brown@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/e1000e/e1000.h  |  5 +-
+ drivers/net/ethernet/intel/e1000e/netdev.c | 54 ++++++++++------------
+ 2 files changed, 27 insertions(+), 32 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
+index 6c51b1bad8c42..37a2314d3e6b1 100644
+--- a/drivers/net/ethernet/intel/e1000e/e1000.h
++++ b/drivers/net/ethernet/intel/e1000e/e1000.h
+@@ -185,13 +185,12 @@ struct e1000_phy_regs {
+ /* board specific private data structure */
+ struct e1000_adapter {
++      struct timer_list watchdog_timer;
+       struct timer_list phy_info_timer;
+       struct timer_list blink_timer;
+       struct work_struct reset_task;
+-      struct delayed_work watchdog_task;
+-
+-      struct workqueue_struct *e1000_workqueue;
++      struct work_struct watchdog_task;
+       const struct e1000_info *ei;
+diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
+index aa9fdda839148..c27ed7363768c 100644
+--- a/drivers/net/ethernet/intel/e1000e/netdev.c
++++ b/drivers/net/ethernet/intel/e1000e/netdev.c
+@@ -1780,8 +1780,7 @@ static irqreturn_t e1000_intr_msi(int __always_unused irq, void *data)
+               }
+               /* guard against interrupt when we're going down */
+               if (!test_bit(__E1000_DOWN, &adapter->state))
+-                      mod_delayed_work(adapter->e1000_workqueue,
+-                                       &adapter->watchdog_task, HZ);
++                      mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       }
+       /* Reset on uncorrectable ECC error */
+@@ -1861,8 +1860,7 @@ static irqreturn_t e1000_intr(int __always_unused irq, void *data)
+               }
+               /* guard against interrupt when we're going down */
+               if (!test_bit(__E1000_DOWN, &adapter->state))
+-                      mod_delayed_work(adapter->e1000_workqueue,
+-                                       &adapter->watchdog_task, HZ);
++                      mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       }
+       /* Reset on uncorrectable ECC error */
+@@ -1907,8 +1905,7 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data)
+               hw->mac.get_link_status = true;
+               /* guard against interrupt when we're going down */
+               if (!test_bit(__E1000_DOWN, &adapter->state))
+-                      mod_delayed_work(adapter->e1000_workqueue,
+-                                       &adapter->watchdog_task, HZ);
++                      mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       }
+       if (!test_bit(__E1000_DOWN, &adapter->state))
+@@ -4281,6 +4278,7 @@ void e1000e_down(struct e1000_adapter *adapter, bool reset)
+       napi_synchronize(&adapter->napi);
++      del_timer_sync(&adapter->watchdog_timer);
+       del_timer_sync(&adapter->phy_info_timer);
+       spin_lock(&adapter->stats64_lock);
+@@ -5152,11 +5150,25 @@ static void e1000e_check_82574_phy_workaround(struct e1000_adapter *adapter)
+       }
+ }
++/**
++ * e1000_watchdog - Timer Call-back
++ * @data: pointer to adapter cast into an unsigned long
++ **/
++static void e1000_watchdog(struct timer_list *t)
++{
++      struct e1000_adapter *adapter = from_timer(adapter, t, watchdog_timer);
++
++      /* Do the rest outside of interrupt context */
++      schedule_work(&adapter->watchdog_task);
++
++      /* TODO: make this use queue_delayed_work() */
++}
++
+ static void e1000_watchdog_task(struct work_struct *work)
+ {
+       struct e1000_adapter *adapter = container_of(work,
+                                                    struct e1000_adapter,
+-                                                   watchdog_task.work);
++                                                   watchdog_task);
+       struct net_device *netdev = adapter->netdev;
+       struct e1000_mac_info *mac = &adapter->hw.mac;
+       struct e1000_phy_info *phy = &adapter->hw.phy;
+@@ -5404,9 +5416,8 @@ static void e1000_watchdog_task(struct work_struct *work)
+       /* Reset the timer */
+       if (!test_bit(__E1000_DOWN, &adapter->state))
+-              queue_delayed_work(adapter->e1000_workqueue,
+-                                 &adapter->watchdog_task,
+-                                 round_jiffies(2 * HZ));
++              mod_timer(&adapter->watchdog_timer,
++                        round_jiffies(jiffies + 2 * HZ));
+ }
+ #define E1000_TX_FLAGS_CSUM           0x00000001
+@@ -7259,21 +7270,11 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+               goto err_eeprom;
+       }
+-      adapter->e1000_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0,
+-                                                 e1000e_driver_name);
+-
+-      if (!adapter->e1000_workqueue) {
+-              err = -ENOMEM;
+-              goto err_workqueue;
+-      }
+-
+-      INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog_task);
+-      queue_delayed_work(adapter->e1000_workqueue, &adapter->watchdog_task,
+-                         0);
+-
++      timer_setup(&adapter->watchdog_timer, e1000_watchdog, 0);
+       timer_setup(&adapter->phy_info_timer, e1000_update_phy_info, 0);
+       INIT_WORK(&adapter->reset_task, e1000_reset_task);
++      INIT_WORK(&adapter->watchdog_task, e1000_watchdog_task);
+       INIT_WORK(&adapter->downshift_task, e1000e_downshift_workaround);
+       INIT_WORK(&adapter->update_phy_task, e1000e_update_phy_task);
+       INIT_WORK(&adapter->print_hang_task, e1000_print_hw_hang);
+@@ -7367,9 +7368,6 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+       return 0;
+ err_register:
+-      flush_workqueue(adapter->e1000_workqueue);
+-      destroy_workqueue(adapter->e1000_workqueue);
+-err_workqueue:
+       if (!(adapter->flags & FLAG_HAS_AMT))
+               e1000e_release_hw_control(adapter);
+ err_eeprom:
+@@ -7414,17 +7412,15 @@ static void e1000_remove(struct pci_dev *pdev)
+        * from being rescheduled.
+        */
+       set_bit(__E1000_DOWN, &adapter->state);
++      del_timer_sync(&adapter->watchdog_timer);
+       del_timer_sync(&adapter->phy_info_timer);
+       cancel_work_sync(&adapter->reset_task);
++      cancel_work_sync(&adapter->watchdog_task);
+       cancel_work_sync(&adapter->downshift_task);
+       cancel_work_sync(&adapter->update_phy_task);
+       cancel_work_sync(&adapter->print_hang_task);
+-      cancel_delayed_work(&adapter->watchdog_task);
+-      flush_workqueue(adapter->e1000_workqueue);
+-      destroy_workqueue(adapter->e1000_workqueue);
+-
+       if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) {
+               cancel_work_sync(&adapter->tx_hwtstamp_work);
+               if (adapter->tx_hwtstamp_skb) {
+-- 
+2.20.1
+
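For context on what the revert restores, below is a minimal, hedged sketch of the timer-plus-work watchdog pattern written as a trivial out-of-tree module (all demo_* names are invented, not part of the driver): the timer callback runs in softirq context and only schedules a work item, the work item does the heavy lifting in process context and re-arms the timer, and teardown sets a "down" bit before del_timer_sync()/cancel_work_sync(), mirroring the ordering in e1000_remove().

/* Hedged sketch of the timer_list + work_struct watchdog pattern the revert
 * restores; builds as a trivial out-of-tree module, names are hypothetical. */
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/bitops.h>

#define DEMO_DOWN 0                       /* bit index, like __E1000_DOWN */

static unsigned long demo_state;
static struct timer_list demo_timer;
static struct work_struct demo_task;

static void demo_task_fn(struct work_struct *work)
{
        pr_info("watchdog-style task ran in process context\n");
        /* Re-arm only while not going down, as e1000_watchdog_task() does. */
        if (!test_bit(DEMO_DOWN, &demo_state))
                mod_timer(&demo_timer, round_jiffies(jiffies + 2 * HZ));
}

static void demo_timer_fn(struct timer_list *t)
{
        /* Softirq context: defer the real work, mirroring e1000_watchdog(). */
        schedule_work(&demo_task);
}

static int __init demo_init(void)
{
        timer_setup(&demo_timer, demo_timer_fn, 0);
        INIT_WORK(&demo_task, demo_task_fn);
        mod_timer(&demo_timer, jiffies + HZ);
        return 0;
}

static void __exit demo_exit(void)
{
        set_bit(DEMO_DOWN, &demo_state);  /* stop re-arming ...              */
        del_timer_sync(&demo_timer);      /* ... then kill the timer ...     */
        cancel_work_sync(&demo_task);     /* ... and wait for the work item. */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");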
diff --git a/queue-5.4/series b/queue-5.4/series
index e3e14c386dd0785852915eb9f37775a630a254bc..c93137fab938883f5876044ba95a3e8120e1b39b 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -1,2 +1,7 @@
 vfs-fix-do_last-regression.patch
 cifs-fix-soft-mounts-hanging-in-the-reconnect-code.patch
+x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch
+x86-resctrl-fix-use-after-free-when-deleting-resourc.patch
+x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch
+e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch
+e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch
diff --git a/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch b/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch
new file mode 100644
index 0000000..1f0d90e
--- /dev/null
+++ b/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch
@@ -0,0 +1,221 @@
+From 45b1dca1b89b2e2ac862d2e57616677d0a10acd1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Jan 2020 00:28:05 +0800
+Subject: x86/resctrl: Fix a deadlock due to inaccurate reference
+
+From: Xiaochen Shen <xiaochen.shen@intel.com>
+
+[ Upstream commit 334b0f4e9b1b4a1d475f803419d202f6c5e4d18e ]
+
+There is a race condition which results in a deadlock when rmdir and
+mkdir execute concurrently:
+
+$ ls /sys/fs/resctrl/c1/mon_groups/m1/
+cpus  cpus_list  mon_data  tasks
+
+Thread 1: rmdir /sys/fs/resctrl/c1
+Thread 2: mkdir /sys/fs/resctrl/c1/mon_groups/m1
+
+3 locks held by mkdir/48649:
+ #0:  (sb_writers#17){.+.+}, at: [<ffffffffb4ca2aa0>] mnt_want_write+0x20/0x50
+ #1:  (&type->i_mutex_dir_key#8/1){+.+.}, at: [<ffffffffb4c8c13b>] filename_create+0x7b/0x170
+ #2:  (rdtgroup_mutex){+.+.}, at: [<ffffffffb4a4389d>] rdtgroup_kn_lock_live+0x3d/0x70
+
+4 locks held by rmdir/48652:
+ #0:  (sb_writers#17){.+.+}, at: [<ffffffffb4ca2aa0>] mnt_want_write+0x20/0x50
+ #1:  (&type->i_mutex_dir_key#8/1){+.+.}, at: [<ffffffffb4c8c3cf>] do_rmdir+0x13f/0x1e0
+ #2:  (&type->i_mutex_dir_key#8){++++}, at: [<ffffffffb4c86d5d>] vfs_rmdir+0x4d/0x120
+ #3:  (rdtgroup_mutex){+.+.}, at: [<ffffffffb4a4389d>] rdtgroup_kn_lock_live+0x3d/0x70
+
+Thread 1 is deleting control group "c1". Holding rdtgroup_mutex,
+kernfs_remove() removes all kernfs nodes under directory "c1"
+recursively, then waits for sub kernfs node "mon_groups" to drop active
+reference.
+
+Thread 2 is trying to create a subdirectory "m1" in the "mon_groups"
+directory. The wrapper kernfs_iop_mkdir() takes an active reference to
+the "mon_groups" directory but the code drops the active reference to
+the parent directory "c1" instead.
+
+As a result, Thread 1 is blocked waiting for the active reference to drop
+and never releases rdtgroup_mutex, while Thread 2 is also blocked
+trying to acquire rdtgroup_mutex.
+
+Thread 1 (rdtgroup_rmdir)   Thread 2 (rdtgroup_mkdir)
+(rmdir /sys/fs/resctrl/c1)  (mkdir /sys/fs/resctrl/c1/mon_groups/m1)
+-------------------------   -------------------------
+                            kernfs_iop_mkdir
+                              /*
+                               * kn: "m1", parent_kn: "mon_groups",
+                               * prgrp_kn: parent_kn->parent: "c1",
+                               *
+                               * "mon_groups", parent_kn->active++: 1
+                               */
+                              kernfs_get_active(parent_kn)
+kernfs_iop_rmdir
+  /* "c1", kn->active++ */
+  kernfs_get_active(kn)
+
+  rdtgroup_kn_lock_live
+    atomic_inc(&rdtgrp->waitcount)
+    /* "c1", kn->active-- */
+    kernfs_break_active_protection(kn)
+    mutex_lock
+
+  rdtgroup_rmdir_ctrl
+    free_all_child_rdtgrp
+      sentry->flags = RDT_DELETED
+
+    rdtgroup_ctrl_remove
+      rdtgrp->flags = RDT_DELETED
+      kernfs_get(kn)
+      kernfs_remove(rdtgrp->kn)
+        __kernfs_remove
+          /* "mon_groups", sub_kn */
+          atomic_add(KN_DEACTIVATED_BIAS, &sub_kn->active)
+          kernfs_drain(sub_kn)
+            /*
+             * sub_kn->active == KN_DEACTIVATED_BIAS + 1,
+             * waiting on sub_kn->active to drop, but it
+             * never drops in Thread 2 which is blocked
+             * on getting rdtgroup_mutex.
+             */
+Thread 1 hangs here ---->
+            wait_event(sub_kn->active == KN_DEACTIVATED_BIAS)
+            ...
+                              rdtgroup_mkdir
+                                rdtgroup_mkdir_mon(parent_kn, prgrp_kn)
+                                  mkdir_rdt_prepare(parent_kn, prgrp_kn)
+                                    rdtgroup_kn_lock_live(prgrp_kn)
+                                      atomic_inc(&rdtgrp->waitcount)
+                                      /*
+                                       * "c1", prgrp_kn->active--
+                                       *
+                                       * The active reference on "c1" is
+                                       * dropped, but not matching the
+                                       * actual active reference taken
+                                       * on "mon_groups", thus causing
+                                       * Thread 1 to wait forever while
+                                       * holding rdtgroup_mutex.
+                                       */
+                                      kernfs_break_active_protection(
+                                                               prgrp_kn)
+                                      /*
+                                       * Trying to get rdtgroup_mutex
+                                       * which is held by Thread 1.
+                                       */
+Thread 2 hangs here ---->             mutex_lock
+                                      ...
+
+The problem is that the creation of a subdirectory in the "mon_groups"
+directory incorrectly releases the active protection of its parent
+directory instead of itself before it starts waiting for rdtgroup_mutex.
+This is triggered by the rdtgroup_mkdir() flow calling
+rdtgroup_kn_lock_live()/rdtgroup_kn_unlock() with kernfs node of the
+parent control group ("c1") as argument. It should be called with kernfs
+node "mon_groups" instead. What is currently missing is that the
+kn->priv of "mon_groups" is NULL instead of pointing to the rdtgrp.
+
+Fix it by pointing kn->priv to rdtgrp when "mon_groups" is created. Then
+it could be passed to rdtgroup_kn_lock_live()/rdtgroup_kn_unlock()
+instead. And then it operates on the same rdtgroup structure but handles
+the active reference of kernfs node "mon_groups" to prevent deadlock.
+The same changes are also made to the "mon_data" directories.
+
+This results in some unused function parameters that will be cleaned up
+in a follow-up patch, as the focus here is kept on the minimal fix in
+support of backporting efforts.
+
+Fixes: c7d9aac61311 ("x86/intel_rdt/cqm: Add mkdir support for RDT monitoring")
+Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
+Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Tony Luck <tony.luck@intel.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/1578500886-21771-4-git-send-email-xiaochen.shen@intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+index dac7209a07084..e4da26325e3ea 100644
+--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+@@ -1970,7 +1970,7 @@ static int rdt_get_tree(struct fs_context *fc)
+       if (rdt_mon_capable) {
+               ret = mongroup_create_dir(rdtgroup_default.kn,
+-                                        NULL, "mon_groups",
++                                        &rdtgroup_default, "mon_groups",
+                                         &kn_mongrp);
+               if (ret < 0)
+                       goto out_info;
+@@ -2446,7 +2446,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+       /*
+        * Create the mon_data directory first.
+        */
+-      ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
++      ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
+       if (ret)
+               return ret;
+@@ -2645,7 +2645,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+       uint files = 0;
+       int ret;
+-      prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
++      prdtgrp = rdtgroup_kn_lock_live(parent_kn);
+       if (!prdtgrp) {
+               ret = -ENODEV;
+               goto out_unlock;
+@@ -2718,7 +2718,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+       kernfs_activate(kn);
+       /*
+-       * The caller unlocks the prgrp_kn upon success.
++       * The caller unlocks the parent_kn upon success.
+        */
+       return 0;
+@@ -2729,7 +2729,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ out_free_rgrp:
+       kfree(rdtgrp);
+ out_unlock:
+-      rdtgroup_kn_unlock(prgrp_kn);
++      rdtgroup_kn_unlock(parent_kn);
+       return ret;
+ }
+@@ -2767,7 +2767,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+        */
+       list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+-      rdtgroup_kn_unlock(prgrp_kn);
++      rdtgroup_kn_unlock(parent_kn);
+       return ret;
+ }
+@@ -2810,7 +2810,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+                * Create an empty mon_groups directory to hold the subset
+                * of tasks and cpus to monitor.
+                */
+-              ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
++              ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
+               if (ret) {
+                       rdt_last_cmd_puts("kernfs subdir error\n");
+                       goto out_del_list;
+@@ -2826,7 +2826,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+ out_common_fail:
+       mkdir_rdt_prepare_clean(rdtgrp);
+ out_unlock:
+-      rdtgroup_kn_unlock(prgrp_kn);
++      rdtgroup_kn_unlock(parent_kn);
+       return ret;
+ }
+-- 
+2.20.1
+
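To make the reference discipline behind this fix concrete, here is a small, hedged userspace analog in plain C with pthreads (struct node and its helpers are invented for illustration and are not the kernfs API): whichever node had its active count elevated on entry is the node whose count must be dropped, otherwise a remover draining that node waits forever.

/* Hedged userspace analog of the kernfs active-reference discipline; all
 * names are hypothetical, not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct node {
        int active;                              /* like kn->active        */
        pthread_mutex_t lock;
        pthread_cond_t drained;
};

static void node_get_active(struct node *n)      /* like kernfs_get_active */
{
        pthread_mutex_lock(&n->lock);
        n->active++;
        pthread_mutex_unlock(&n->lock);
}

static void node_put_active(struct node *n)      /* like kernfs_put_active */
{
        pthread_mutex_lock(&n->lock);
        if (--n->active == 0)
                pthread_cond_signal(&n->drained);
        pthread_mutex_unlock(&n->lock);
}

static void node_drain(struct node *n)           /* like kernfs_drain()    */
{
        pthread_mutex_lock(&n->lock);
        while (n->active != 0)
                pthread_cond_wait(&n->drained, &n->lock);
        pthread_mutex_unlock(&n->lock);
}

int main(void)
{
        struct node mon_groups = {
                0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
        };

        node_get_active(&mon_groups);   /* mkdir elevated "mon_groups"      */
        /* The buggy flow dropped the count on the parent ("c1") here, so
         * mon_groups.active stayed at 1 and a concurrent drain of
         * "mon_groups" could never finish.  The fix drops it on the node
         * that was actually elevated: */
        node_put_active(&mon_groups);
        node_drain(&mon_groups);        /* rmdir-style drain now completes  */
        printf("drained, no deadlock\n");
        return 0;
}

Built with cc -pthread, the program prints "drained, no deadlock" precisely because the put is matched against the node that was actually taken.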
diff --git a/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch b/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch
new file mode 100644
index 0000000..4721f7c
--- /dev/null
+++ b/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch
@@ -0,0 +1,128 @@
+From c78ba78f3b5b50eecaefbc66780643af964dbbba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Jan 2020 00:28:04 +0800
+Subject: x86/resctrl: Fix use-after-free due to inaccurate refcount of
+ rdtgroup
+
+From: Xiaochen Shen <xiaochen.shen@intel.com>
+
+[ Upstream commit 074fadee59ee7a9d2b216e9854bd4efb5dad679f ]
+
+There is a race condition in the following scenario which results in a
+use-after-free issue when reading a monitoring file and deleting the
+parent ctrl_mon group concurrently:
+
+Thread 1 calls atomic_inc() to take refcount of rdtgrp and then calls
+kernfs_break_active_protection() to drop the active reference of kernfs
+node in rdtgroup_kn_lock_live().
+
+In Thread 2, kernfs_remove() is a blocking routine. It waits on all sub
+kernfs nodes to drop the active reference when removing all subtree
+kernfs nodes recursively. Thread 2 could block on kernfs_remove() until
+Thread 1 calls kernfs_break_active_protection(). Only after
+kernfs_remove() completes can the refcount of rdtgrp be trusted.
+
+Before Thread 1 calls atomic_inc() and kernfs_break_active_protection(),
+Thread 2 could call kfree() when the refcount of rdtgrp (sentry) is 0
+instead of 1 due to the race.
+
+In Thread 1, rdtgroup_kn_unlock() then refers to the earlier rdtgrp memory
+(rdtgrp->waitcount) which was already freed in Thread 2, resulting in a
+use-after-free issue.
+
+Thread 1 (rdtgroup_mondata_show)  Thread 2 (rdtgroup_rmdir)
+--------------------------------  -------------------------
+rdtgroup_kn_lock_live
+  /*
+   * kn active protection until
+   * kernfs_break_active_protection(kn)
+   */
+  rdtgrp = kernfs_to_rdtgroup(kn)
+                                  rdtgroup_kn_lock_live
+                                    atomic_inc(&rdtgrp->waitcount)
+                                    mutex_lock
+                                  rdtgroup_rmdir_ctrl
+                                    free_all_child_rdtgrp
+                                      /*
+                                       * sentry->waitcount should be 1
+                                       * but is 0 now due to the race.
+                                       */
+                                      kfree(sentry)*[1]
+  /*
+   * Only after kernfs_remove()
+   * completes, the refcount of
+   * rdtgrp could be trusted.
+   */
+  atomic_inc(&rdtgrp->waitcount)
+  /* kn->active-- */
+  kernfs_break_active_protection(kn)
+                                    rdtgroup_ctrl_remove
+                                      rdtgrp->flags = RDT_DELETED
+                                      /*
+                                       * Blocking routine, wait for
+                                       * all sub kernfs nodes to drop
+                                       * active reference in
+                                       * kernfs_break_active_protection.
+                                       */
+                                      kernfs_remove(rdtgrp->kn)
+                                  rdtgroup_kn_unlock
+                                    mutex_unlock
+                                    atomic_dec_and_test(
+                                                &rdtgrp->waitcount)
+                                    && (flags & RDT_DELETED)
+                                      kernfs_unbreak_active_protection(kn)
+                                      kfree(rdtgrp)
+  mutex_lock
+mon_event_read
+rdtgroup_kn_unlock
+  mutex_unlock
+  /*
+   * Use-after-free: refer to earlier rdtgrp
+   * memory which was freed in [1].
+   */
+  atomic_dec_and_test(&rdtgrp->waitcount)
+  && (flags & RDT_DELETED)
+    /* kn->active++ */
+    kernfs_unbreak_active_protection(kn)
+    kfree(rdtgrp)
+
+Fix it by moving free_all_child_rdtgrp() to after kernfs_remove() in
+rdtgroup_rmdir_ctrl() to ensure it has the accurate refcount of rdtgrp.
+
+Fixes: f3cbeacaa06e ("x86/intel_rdt/cqm: Add rmdir support")
+Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
+Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Tony Luck <tony.luck@intel.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/1578500886-21771-3-git-send-email-xiaochen.shen@intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+index c7564294a12a8..954fd048ad9bd 100644
+--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+@@ -2960,13 +2960,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+       closid_free(rdtgrp->closid);
+       free_rmid(rdtgrp->mon.rmid);
++      rdtgroup_ctrl_remove(kn, rdtgrp);
++
+       /*
+        * Free all the child monitor group rmids.
+        */
+       free_all_child_rdtgrp(rdtgrp);
+-      rdtgroup_ctrl_remove(kn, rdtgrp);
+-
+       return 0;
+ }
+-- 
+2.20.1
+
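As a rough, hedged illustration of the ordering this patch enforces, here is a userspace analog with pthreads (none of the names are kernel APIs; the barrier stands in for kernfs_remove() completing): the remover may act on the group's waitcount only after the drain step has forced the in-flight reader to register itself, otherwise it could free the group out from under that reader.

/* Hedged userspace analog (hypothetical names, not the resctrl code): the
 * remover may inspect the group's waitcount only after the drain step
 * (kernfs_remove() in the kernel, a barrier here) has completed. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
        atomic_int waitcount;
        int deleted;                  /* stands in for RDT_DELETED          */
};

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER; /* rdtgroup_mutex */
static pthread_barrier_t drained;     /* stands in for kernfs_remove() finishing */

static void *reader(void *arg)        /* plays the rdtgroup_mondata_show side */
{
        struct group *grp = arg;

        atomic_fetch_add(&grp->waitcount, 1);  /* rdtgroup_kn_lock_live()   */
        pthread_barrier_wait(&drained);        /* lets the remover's drain finish */
        pthread_mutex_lock(&big_lock);
        /* ... read monitoring data ... */
        pthread_mutex_unlock(&big_lock);
        if (atomic_fetch_sub(&grp->waitcount, 1) == 1 && grp->deleted)
                free(grp);                     /* last waiter frees         */
        return NULL;
}

int main(void)
{
        struct group *grp = malloc(sizeof(*grp));
        pthread_t t;

        atomic_init(&grp->waitcount, 0);
        grp->deleted = 0;
        pthread_barrier_init(&drained, NULL, 2);
        pthread_create(&t, NULL, reader, grp);

        pthread_mutex_lock(&big_lock);         /* remover, like rdtgroup_rmdir() */
        pthread_barrier_wait(&drained);        /* drain first ...                */
        if (atomic_load(&grp->waitcount) != 0) /* ... then the count is meaningful */
                grp->deleted = 1;              /* defer the free to the waiter   */
        else
                free(grp);
        pthread_mutex_unlock(&big_lock);

        pthread_join(t, NULL);
        puts("no premature free");
        return 0;
}

Compiled with cc -pthread, the program deterministically defers the free to the waiter and prints "no premature free".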
diff --git a/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch b/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch
new file mode 100644
index 0000000..8a0abaf
--- /dev/null
+++ b/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch
@@ -0,0 +1,224 @@
+From 545e5454ad863083e84ad46372551f81b93f47f8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Jan 2020 00:28:03 +0800
+Subject: x86/resctrl: Fix use-after-free when deleting resource groups
+
+From: Xiaochen Shen <xiaochen.shen@intel.com>
+
+[ Upstream commit b8511ccc75c033f6d54188ea4df7bf1e85778740 ]
+
+A resource group (rdtgrp) contains a reference count (rdtgrp->waitcount)
+that indicates how many waiters expect this rdtgrp to exist. Waiters
+could be blocked on rdtgroup_mutex, or they could be work items sitting
+on a task's task_work queue, to be run when the task returns from kernel
+mode or exits.
+
+The deletion of a rdtgrp is intended to have two phases:
+
+  (1) while holding rdtgroup_mutex the necessary cleanup is done and
+  rdtgrp->flags is set to RDT_DELETED,
+
+  (2) after releasing the rdtgroup_mutex, the rdtgrp structure is freed
+  only if there are no waiters and its flag is set to RDT_DELETED. Upon
+  gaining access to rdtgroup_mutex or rdtgrp, a waiter is required to check
+  for the RDT_DELETED flag.
+
+When unmounting the resctrl file system or deleting ctrl_mon groups,
+all of the subdirectories are removed and the data structure of rdtgrp
+is forcibly freed without checking rdtgrp->waitcount. If at this point
+there was a waiter on rdtgrp then a use-after-free issue occurs when the
+waiter starts running and accesses the rdtgrp structure it was waiting
+on.
+
+See the kfree() calls at [1], [2] and [3] in these two call paths, shown
+in the following scenarios:
+(1) rdt_kill_sb() -> rmdir_all_sub() -> free_all_child_rdtgrp()
+(2) rdtgroup_rmdir() -> rdtgroup_rmdir_ctrl() -> free_all_child_rdtgrp()
+
+There are several scenarios that result in a use-after-free issue, as
+described in the following:
+
+Scenario 1:
+-----------
+In Thread 1, rdtgroup_tasks_write() adds a task_work callback
+move_myself(). If move_myself() is scheduled to execute after Thread 2
+rdt_kill_sb() is finished, referring to earlier rdtgrp memory
+(rdtgrp->waitcount) which was already freed in Thread 2 results in a
+use-after-free issue.
+
+Thread 1 (rdtgroup_tasks_write)        Thread 2 (rdt_kill_sb)
+-------------------------------        ----------------------
+rdtgroup_kn_lock_live
+  atomic_inc(&rdtgrp->waitcount)
+  mutex_lock
+rdtgroup_move_task
+  __rdtgroup_move_task
+    /*
+     * Take an extra refcount, so rdtgrp cannot be freed
+     * before the call back move_myself has been invoked
+     */
+    atomic_inc(&rdtgrp->waitcount)
+    /* Callback move_myself will be scheduled for later */
+    task_work_add(move_myself)
+rdtgroup_kn_unlock
+  mutex_unlock
+  atomic_dec_and_test(&rdtgrp->waitcount)
+  && (flags & RDT_DELETED)
+                                       mutex_lock
+                                       rmdir_all_sub
+                                         /*
+                                          * sentry and rdtgrp are freed
+                                          * without checking refcount
+                                          */
+                                         free_all_child_rdtgrp
+                                           kfree(sentry)*[1]
+                                         kfree(rdtgrp)*[2]
+                                       mutex_unlock
+/*
+ * Callback is scheduled to execute
+ * after rdt_kill_sb is finished
+ */
+move_myself
+  /*
+   * Use-after-free: refer to earlier rdtgrp
+   * memory which was freed in [1] or [2].
+   */
+  atomic_dec_and_test(&rdtgrp->waitcount)
+  && (flags & RDT_DELETED)
+    kfree(rdtgrp)
+
+Scenario 2:
+-----------
+In Thread 1, rdtgroup_tasks_write() adds a task_work callback
+move_myself(). If move_myself() is scheduled to execute after Thread 2
+rdtgroup_rmdir() is finished, referring to earlier rdtgrp memory
+(rdtgrp->waitcount) which was already freed in Thread 2 results in a
+use-after-free issue.
+
+Thread 1 (rdtgroup_tasks_write)        Thread 2 (rdtgroup_rmdir)
+-------------------------------        -------------------------
+rdtgroup_kn_lock_live
+  atomic_inc(&rdtgrp->waitcount)
+  mutex_lock
+rdtgroup_move_task
+  __rdtgroup_move_task
+    /*
+     * Take an extra refcount, so rdtgrp cannot be freed
+     * before the call back move_myself has been invoked
+     */
+    atomic_inc(&rdtgrp->waitcount)
+    /* Callback move_myself will be scheduled for later */
+    task_work_add(move_myself)
+rdtgroup_kn_unlock
+  mutex_unlock
+  atomic_dec_and_test(&rdtgrp->waitcount)
+  && (flags & RDT_DELETED)
+                                       rdtgroup_kn_lock_live
+                                         atomic_inc(&rdtgrp->waitcount)
+                                         mutex_lock
+                                       rdtgroup_rmdir_ctrl
+                                         free_all_child_rdtgrp
+                                           /*
+                                            * sentry is freed without
+                                            * checking refcount
+                                            */
+                                           kfree(sentry)*[3]
+                                         rdtgroup_ctrl_remove
+                                           rdtgrp->flags = RDT_DELETED
+                                       rdtgroup_kn_unlock
+                                         mutex_unlock
+                                         atomic_dec_and_test(
+                                                     &rdtgrp->waitcount)
+                                         && (flags & RDT_DELETED)
+                                           kfree(rdtgrp)
+/*
+ * Callback is scheduled to execute
+ * after rdt_kill_sb is finished
+ */
+move_myself
+  /*
+   * Use-after-free: refer to earlier rdtgrp
+   * memory which was freed in [3].
+   */
+  atomic_dec_and_test(&rdtgrp->waitcount)
+  && (flags & RDT_DELETED)
+    kfree(rdtgrp)
+
+If CONFIG_DEBUG_SLAB=y, slab corruption on kmalloc-2k can be observed
+like the following. Note that "0x6b" is POISON_FREE after kfree(). The
+corrupted bytes "0x6a", "0x64" at offset 0x424 correspond to the
+waitcount member of struct rdtgroup which was freed:
+
+  Slab corruption (Not tainted): kmalloc-2k start=ffff9504c5b0d000, len=2048
+  420: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkjkkkkkkkkkkk
+  Single bit error detected. Probably bad RAM.
+  Run memtest86+ or a similar memory test tool.
+  Next obj: start=ffff9504c5b0d800, len=2048
+  000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+  010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+
+  Slab corruption (Not tainted): kmalloc-2k start=ffff9504c58ab800, len=2048
+  420: 6b 6b 6b 6b 64 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkdkkkkkkkkkkk
+  Prev obj: start=ffff9504c58ab000, len=2048
+  000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+  010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+
+Fix this by taking the reference count (waitcount) of rdtgrp into account
+in the two call paths that currently do not do so. Instead of always being
+freed, the resource group is freed only if there are no waiters on it. If
+there are waiters, the resource group merely has its flags set to
+RDT_DELETED.
+
+It is then left to the waiter to free the resource group when it starts
+running and finds that it was the last waiter and that the resource group
+has since been removed (rdtgrp->flags & RDT_DELETED). The two call paths
+fixed are:
+(1) rdt_kill_sb() -> rmdir_all_sub() -> free_all_child_rdtgrp()
+(2) rdtgroup_rmdir() -> rdtgroup_rmdir_ctrl() -> free_all_child_rdtgrp()
+
+Fixes: f3cbeacaa06e ("x86/intel_rdt/cqm: Add rmdir support")
+Fixes: 60cf5e101fd4 ("x86/intel_rdt: Add mkdir to resctrl file system")
+Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
+Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Tony Luck <tony.luck@intel.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/1578500886-21771-2-git-send-email-xiaochen.shen@intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+index e4da26325e3ea..c7564294a12a8 100644
+--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+@@ -2205,7 +2205,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+       list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+               free_rmid(sentry->mon.rmid);
+               list_del(&sentry->mon.crdtgrp_list);
+-              kfree(sentry);
++
++              if (atomic_read(&sentry->waitcount) != 0)
++                      sentry->flags = RDT_DELETED;
++              else
++                      kfree(sentry);
+       }
+ }
+@@ -2243,7 +2247,11 @@ static void rmdir_all_sub(void)
+               kernfs_remove(rdtgrp->kn);
+               list_del(&rdtgrp->rdtgroup_list);
+-              kfree(rdtgrp);
++
++              if (atomic_read(&rdtgrp->waitcount) != 0)
++                      rdtgrp->flags = RDT_DELETED;
++              else
++                      kfree(rdtgrp);
+       }
+       /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+       update_closid_rmid(cpu_online_mask, &rdtgroup_default);
+-- 
+2.20.1
+
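Finally, a compact, hedged sketch of the deferred-free rule this patch adds, in plain C (hypothetical names; the rdtgroup_mutex serialization the kernel uses around these steps is elided): the remover frees a group only when nobody is waiting on it, otherwise it just marks the group deleted and the last waiter frees it on its way out.

/* Hedged sketch of the waitcount-guarded free this patch introduces;
 * names are hypothetical and the rdtgroup_mutex serialization present in
 * the kernel around these steps is elided. */
#include <stdatomic.h>
#include <stdlib.h>

#define GRP_DELETED 0x1u                        /* stands in for RDT_DELETED */

struct group {
        atomic_int waitcount;
        unsigned int flags;
};

/* Remover side, mirroring the fixed free_all_child_rdtgrp()/rmdir_all_sub():
 * never free a group that still has waiters. */
static void group_remove(struct group *grp)
{
        if (atomic_load(&grp->waitcount) != 0)
                grp->flags |= GRP_DELETED;      /* last waiter frees it      */
        else
                free(grp);
}

/* Waiter side, mirroring rdtgroup_kn_unlock(): the last waiter out frees a
 * group that was removed while it was being waited on. */
static void group_unlock(struct group *grp)
{
        if (atomic_fetch_sub(&grp->waitcount, 1) == 1 &&
            (grp->flags & GRP_DELETED))
                free(grp);
}

int main(void)
{
        struct group *grp = malloc(sizeof(*grp));

        atomic_init(&grp->waitcount, 0);
        grp->flags = 0;
        atomic_fetch_add(&grp->waitcount, 1);   /* a waiter registers        */
        group_remove(grp);                      /* removal is deferred       */
        group_unlock(grp);                      /* last waiter frees         */
        return 0;
}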