From: Sasha Levin Date: Mon, 3 Feb 2020 03:01:12 +0000 (-0500) Subject: fixes for 5.4 X-Git-Tag: v5.5.2~39 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=76afec64448bc179c127a0df6043e565781430c0;p=thirdparty%2Fkernel%2Fstable-queue.git fixes for 5.4 Signed-off-by: Sasha Levin --- diff --git a/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch b/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch new file mode 100644 index 00000000000..11c0fd48a4c --- /dev/null +++ b/queue-5.4/e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch @@ -0,0 +1,55 @@ +From 613b8b49ab902e1c338c78698e5d7810ac11be65 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 11 Oct 2019 08:34:59 -0700 +Subject: e1000e: Drop unnecessary __E1000_DOWN bit twiddling + +From: Alexander Duyck + +[ Upstream commit daee5598e491d8d3979bd4ad6c447d89ce57b446 ] + +Since we no longer check for __E1000_DOWN in e1000e_close we can drop the +spot where we were restoring the bit. This saves us a bit of unnecessary +complexity. + +Signed-off-by: Alexander Duyck +Tested-by: Aaron Brown +Signed-off-by: Jeff Kirsher +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/e1000e/netdev.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index d7d56e42a6aac..aa9fdda839148 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -7407,15 +7407,13 @@ static void e1000_remove(struct pci_dev *pdev) + { + struct net_device *netdev = pci_get_drvdata(pdev); + struct e1000_adapter *adapter = netdev_priv(netdev); +- bool down = test_bit(__E1000_DOWN, &adapter->state); + + e1000e_ptp_remove(adapter); + + /* The timers may be rescheduled, so explicitly disable them + * from being rescheduled. + */ +- if (!down) +- set_bit(__E1000_DOWN, &adapter->state); ++ set_bit(__E1000_DOWN, &adapter->state); + del_timer_sync(&adapter->phy_info_timer); + + cancel_work_sync(&adapter->reset_task); +@@ -7435,9 +7433,6 @@ static void e1000_remove(struct pci_dev *pdev) + } + } + +- /* Don't lie to e1000_close() down the road. */ +- if (!down) +- clear_bit(__E1000_DOWN, &adapter->state); + unregister_netdev(netdev); + + if (pci_dev_run_wake(pdev)) +-- +2.20.1 + diff --git a/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch b/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch new file mode 100644 index 00000000000..f246517c7f9 --- /dev/null +++ b/queue-5.4/e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch @@ -0,0 +1,180 @@ +From d84eb43b7150df147cba96b52fb9952232ef4477 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 4 Jan 2020 23:29:22 -0800 +Subject: e1000e: Revert "e1000e: Make watchdog use delayed work" + +From: Jeff Kirsher + +[ Upstream commit d5ad7a6a7f3c87b278d7e4973b65682be4e588dd ] + +This reverts commit 59653e6497d16f7ac1d9db088f3959f57ee8c3db. + +This is due to this commit causing driver crashes and connections to +reset unexpectedly. + +Signed-off-by: Jeff Kirsher +Tested-by: Aaron Brown +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/e1000e/e1000.h | 5 +- + drivers/net/ethernet/intel/e1000e/netdev.c | 54 ++++++++++------------ + 2 files changed, 27 insertions(+), 32 deletions(-) + +diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h +index 6c51b1bad8c42..37a2314d3e6b1 100644 +--- a/drivers/net/ethernet/intel/e1000e/e1000.h ++++ b/drivers/net/ethernet/intel/e1000e/e1000.h +@@ -185,13 +185,12 @@ struct e1000_phy_regs { + + /* board specific private data structure */ + struct e1000_adapter { ++ struct timer_list watchdog_timer; + struct timer_list phy_info_timer; + struct timer_list blink_timer; + + struct work_struct reset_task; +- struct delayed_work watchdog_task; +- +- struct workqueue_struct *e1000_workqueue; ++ struct work_struct watchdog_task; + + const struct e1000_info *ei; + +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index aa9fdda839148..c27ed7363768c 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -1780,8 +1780,7 @@ static irqreturn_t e1000_intr_msi(int __always_unused irq, void *data) + } + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->state)) +- mod_delayed_work(adapter->e1000_workqueue, +- &adapter->watchdog_task, HZ); ++ mod_timer(&adapter->watchdog_timer, jiffies + 1); + } + + /* Reset on uncorrectable ECC error */ +@@ -1861,8 +1860,7 @@ static irqreturn_t e1000_intr(int __always_unused irq, void *data) + } + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->state)) +- mod_delayed_work(adapter->e1000_workqueue, +- &adapter->watchdog_task, HZ); ++ mod_timer(&adapter->watchdog_timer, jiffies + 1); + } + + /* Reset on uncorrectable ECC error */ +@@ -1907,8 +1905,7 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) + hw->mac.get_link_status = true; + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->state)) +- mod_delayed_work(adapter->e1000_workqueue, +- &adapter->watchdog_task, HZ); ++ mod_timer(&adapter->watchdog_timer, jiffies + 1); + } + + if (!test_bit(__E1000_DOWN, &adapter->state)) +@@ -4281,6 +4278,7 @@ void e1000e_down(struct e1000_adapter *adapter, bool reset) + + napi_synchronize(&adapter->napi); + ++ del_timer_sync(&adapter->watchdog_timer); + del_timer_sync(&adapter->phy_info_timer); + + spin_lock(&adapter->stats64_lock); +@@ -5152,11 +5150,25 @@ static void e1000e_check_82574_phy_workaround(struct e1000_adapter *adapter) + } + } + ++/** ++ * e1000_watchdog - Timer Call-back ++ * @data: pointer to adapter cast into an unsigned long ++ **/ ++static void e1000_watchdog(struct timer_list *t) ++{ ++ struct e1000_adapter *adapter = from_timer(adapter, t, watchdog_timer); ++ ++ /* Do the rest outside of interrupt context */ ++ schedule_work(&adapter->watchdog_task); ++ ++ /* TODO: make this use queue_delayed_work() */ ++} ++ + static void e1000_watchdog_task(struct work_struct *work) + { + struct e1000_adapter *adapter = container_of(work, + struct e1000_adapter, +- watchdog_task.work); ++ watchdog_task); + struct net_device *netdev = adapter->netdev; + struct e1000_mac_info *mac = &adapter->hw.mac; + struct e1000_phy_info *phy = &adapter->hw.phy; +@@ -5404,9 +5416,8 @@ static void e1000_watchdog_task(struct work_struct *work) + + /* Reset the timer */ + if (!test_bit(__E1000_DOWN, &adapter->state)) +- queue_delayed_work(adapter->e1000_workqueue, +- &adapter->watchdog_task, +- round_jiffies(2 * HZ)); ++ mod_timer(&adapter->watchdog_timer, ++ round_jiffies(jiffies + 2 * HZ)); + } + + #define E1000_TX_FLAGS_CSUM 0x00000001 +@@ -7259,21 +7270,11 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) + goto err_eeprom; + } + +- adapter->e1000_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, +- e1000e_driver_name); +- +- if (!adapter->e1000_workqueue) { +- err = -ENOMEM; +- goto err_workqueue; +- } +- +- INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog_task); +- queue_delayed_work(adapter->e1000_workqueue, &adapter->watchdog_task, +- 0); +- ++ timer_setup(&adapter->watchdog_timer, e1000_watchdog, 0); + timer_setup(&adapter->phy_info_timer, e1000_update_phy_info, 0); + + INIT_WORK(&adapter->reset_task, e1000_reset_task); ++ INIT_WORK(&adapter->watchdog_task, e1000_watchdog_task); + INIT_WORK(&adapter->downshift_task, e1000e_downshift_workaround); + INIT_WORK(&adapter->update_phy_task, e1000e_update_phy_task); + INIT_WORK(&adapter->print_hang_task, e1000_print_hw_hang); +@@ -7367,9 +7368,6 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) + return 0; + + err_register: +- flush_workqueue(adapter->e1000_workqueue); +- destroy_workqueue(adapter->e1000_workqueue); +-err_workqueue: + if (!(adapter->flags & FLAG_HAS_AMT)) + e1000e_release_hw_control(adapter); + err_eeprom: +@@ -7414,17 +7412,15 @@ static void e1000_remove(struct pci_dev *pdev) + * from being rescheduled. + */ + set_bit(__E1000_DOWN, &adapter->state); ++ del_timer_sync(&adapter->watchdog_timer); + del_timer_sync(&adapter->phy_info_timer); + + cancel_work_sync(&adapter->reset_task); ++ cancel_work_sync(&adapter->watchdog_task); + cancel_work_sync(&adapter->downshift_task); + cancel_work_sync(&adapter->update_phy_task); + cancel_work_sync(&adapter->print_hang_task); + +- cancel_delayed_work(&adapter->watchdog_task); +- flush_workqueue(adapter->e1000_workqueue); +- destroy_workqueue(adapter->e1000_workqueue); +- + if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) { + cancel_work_sync(&adapter->tx_hwtstamp_work); + if (adapter->tx_hwtstamp_skb) { +-- +2.20.1 + diff --git a/queue-5.4/series b/queue-5.4/series index e3e14c386dd..c93137fab93 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -1,2 +1,7 @@ vfs-fix-do_last-regression.patch cifs-fix-soft-mounts-hanging-in-the-reconnect-code.patch +x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch +x86-resctrl-fix-use-after-free-when-deleting-resourc.patch +x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch +e1000e-drop-unnecessary-__e1000_down-bit-twiddling.patch +e1000e-revert-e1000e-make-watchdog-use-delayed-work.patch diff --git a/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch b/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch new file mode 100644 index 00000000000..1f0d90e0363 --- /dev/null +++ b/queue-5.4/x86-resctrl-fix-a-deadlock-due-to-inaccurate-referen.patch @@ -0,0 +1,221 @@ +From 45b1dca1b89b2e2ac862d2e57616677d0a10acd1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Jan 2020 00:28:05 +0800 +Subject: x86/resctrl: Fix a deadlock due to inaccurate reference + +From: Xiaochen Shen + +[ Upstream commit 334b0f4e9b1b4a1d475f803419d202f6c5e4d18e ] + +There is a race condition which results in a deadlock when rmdir and +mkdir execute concurrently: + +$ ls /sys/fs/resctrl/c1/mon_groups/m1/ +cpus cpus_list mon_data tasks + +Thread 1: rmdir /sys/fs/resctrl/c1 +Thread 2: mkdir /sys/fs/resctrl/c1/mon_groups/m1 + +3 locks held by mkdir/48649: + #0: (sb_writers#17){.+.+}, at: [] mnt_want_write+0x20/0x50 + #1: (&type->i_mutex_dir_key#8/1){+.+.}, at: [] filename_create+0x7b/0x170 + #2: (rdtgroup_mutex){+.+.}, at: [] rdtgroup_kn_lock_live+0x3d/0x70 + +4 locks held by rmdir/48652: + #0: (sb_writers#17){.+.+}, at: [] mnt_want_write+0x20/0x50 + #1: (&type->i_mutex_dir_key#8/1){+.+.}, at: [] do_rmdir+0x13f/0x1e0 + #2: (&type->i_mutex_dir_key#8){++++}, at: [] vfs_rmdir+0x4d/0x120 + #3: (rdtgroup_mutex){+.+.}, at: [] rdtgroup_kn_lock_live+0x3d/0x70 + +Thread 1 is deleting control group "c1". Holding rdtgroup_mutex, +kernfs_remove() removes all kernfs nodes under directory "c1" +recursively, then waits for sub kernfs node "mon_groups" to drop active +reference. + +Thread 2 is trying to create a subdirectory "m1" in the "mon_groups" +directory. The wrapper kernfs_iop_mkdir() takes an active reference to +the "mon_groups" directory but the code drops the active reference to +the parent directory "c1" instead. + +As a result, Thread 1 is blocked on waiting for active reference to drop +and never release rdtgroup_mutex, while Thread 2 is also blocked on +trying to get rdtgroup_mutex. + +Thread 1 (rdtgroup_rmdir) Thread 2 (rdtgroup_mkdir) +(rmdir /sys/fs/resctrl/c1) (mkdir /sys/fs/resctrl/c1/mon_groups/m1) +------------------------- ------------------------- + kernfs_iop_mkdir + /* + * kn: "m1", parent_kn: "mon_groups", + * prgrp_kn: parent_kn->parent: "c1", + * + * "mon_groups", parent_kn->active++: 1 + */ + kernfs_get_active(parent_kn) +kernfs_iop_rmdir + /* "c1", kn->active++ */ + kernfs_get_active(kn) + + rdtgroup_kn_lock_live + atomic_inc(&rdtgrp->waitcount) + /* "c1", kn->active-- */ + kernfs_break_active_protection(kn) + mutex_lock + + rdtgroup_rmdir_ctrl + free_all_child_rdtgrp + sentry->flags = RDT_DELETED + + rdtgroup_ctrl_remove + rdtgrp->flags = RDT_DELETED + kernfs_get(kn) + kernfs_remove(rdtgrp->kn) + __kernfs_remove + /* "mon_groups", sub_kn */ + atomic_add(KN_DEACTIVATED_BIAS, &sub_kn->active) + kernfs_drain(sub_kn) + /* + * sub_kn->active == KN_DEACTIVATED_BIAS + 1, + * waiting on sub_kn->active to drop, but it + * never drops in Thread 2 which is blocked + * on getting rdtgroup_mutex. + */ +Thread 1 hangs here ----> + wait_event(sub_kn->active == KN_DEACTIVATED_BIAS) + ... + rdtgroup_mkdir + rdtgroup_mkdir_mon(parent_kn, prgrp_kn) + mkdir_rdt_prepare(parent_kn, prgrp_kn) + rdtgroup_kn_lock_live(prgrp_kn) + atomic_inc(&rdtgrp->waitcount) + /* + * "c1", prgrp_kn->active-- + * + * The active reference on "c1" is + * dropped, but not matching the + * actual active reference taken + * on "mon_groups", thus causing + * Thread 1 to wait forever while + * holding rdtgroup_mutex. + */ + kernfs_break_active_protection( + prgrp_kn) + /* + * Trying to get rdtgroup_mutex + * which is held by Thread 1. + */ +Thread 2 hangs here ----> mutex_lock + ... + +The problem is that the creation of a subdirectory in the "mon_groups" +directory incorrectly releases the active protection of its parent +directory instead of itself before it starts waiting for rdtgroup_mutex. +This is triggered by the rdtgroup_mkdir() flow calling +rdtgroup_kn_lock_live()/rdtgroup_kn_unlock() with kernfs node of the +parent control group ("c1") as argument. It should be called with kernfs +node "mon_groups" instead. What is currently missing is that the +kn->priv of "mon_groups" is NULL instead of pointing to the rdtgrp. + +Fix it by pointing kn->priv to rdtgrp when "mon_groups" is created. Then +it could be passed to rdtgroup_kn_lock_live()/rdtgroup_kn_unlock() +instead. And then it operates on the same rdtgroup structure but handles +the active reference of kernfs node "mon_groups" to prevent deadlock. +The same changes are also made to the "mon_data" directories. + +This results in some unused function parameters that will be cleaned up +in follow-up patch as the focus here is on the fix only in support of +backporting efforts. + +Fixes: c7d9aac61311 ("x86/intel_rdt/cqm: Add mkdir support for RDT monitoring") +Suggested-by: Reinette Chatre +Signed-off-by: Xiaochen Shen +Signed-off-by: Borislav Petkov +Reviewed-by: Reinette Chatre +Reviewed-by: Tony Luck +Acked-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1578500886-21771-4-git-send-email-xiaochen.shen@intel.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +index dac7209a07084..e4da26325e3ea 100644 +--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c ++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +@@ -1970,7 +1970,7 @@ static int rdt_get_tree(struct fs_context *fc) + + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, +- NULL, "mon_groups", ++ &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; +@@ -2446,7 +2446,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, + /* + * Create the mon_data directory first. + */ +- ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); ++ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); + if (ret) + return ret; + +@@ -2645,7 +2645,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + uint files = 0; + int ret; + +- prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); ++ prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; +@@ -2718,7 +2718,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + kernfs_activate(kn); + + /* +- * The caller unlocks the prgrp_kn upon success. ++ * The caller unlocks the parent_kn upon success. + */ + return 0; + +@@ -2729,7 +2729,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + out_free_rgrp: + kfree(rdtgrp); + out_unlock: +- rdtgroup_kn_unlock(prgrp_kn); ++ rdtgroup_kn_unlock(parent_kn); + return ret; + } + +@@ -2767,7 +2767,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + +- rdtgroup_kn_unlock(prgrp_kn); ++ rdtgroup_kn_unlock(parent_kn); + return ret; + } + +@@ -2810,7 +2810,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ +- ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); ++ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; +@@ -2826,7 +2826,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); + out_unlock: +- rdtgroup_kn_unlock(prgrp_kn); ++ rdtgroup_kn_unlock(parent_kn); + return ret; + } + +-- +2.20.1 + diff --git a/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch b/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch new file mode 100644 index 00000000000..4721f7c192a --- /dev/null +++ b/queue-5.4/x86-resctrl-fix-use-after-free-due-to-inaccurate-ref.patch @@ -0,0 +1,128 @@ +From c78ba78f3b5b50eecaefbc66780643af964dbbba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Jan 2020 00:28:04 +0800 +Subject: x86/resctrl: Fix use-after-free due to inaccurate refcount of + rdtgroup + +From: Xiaochen Shen + +[ Upstream commit 074fadee59ee7a9d2b216e9854bd4efb5dad679f ] + +There is a race condition in the following scenario which results in an +use-after-free issue when reading a monitoring file and deleting the +parent ctrl_mon group concurrently: + +Thread 1 calls atomic_inc() to take refcount of rdtgrp and then calls +kernfs_break_active_protection() to drop the active reference of kernfs +node in rdtgroup_kn_lock_live(). + +In Thread 2, kernfs_remove() is a blocking routine. It waits on all sub +kernfs nodes to drop the active reference when removing all subtree +kernfs nodes recursively. Thread 2 could block on kernfs_remove() until +Thread 1 calls kernfs_break_active_protection(). Only after +kernfs_remove() completes the refcount of rdtgrp could be trusted. + +Before Thread 1 calls atomic_inc() and kernfs_break_active_protection(), +Thread 2 could call kfree() when the refcount of rdtgrp (sentry) is 0 +instead of 1 due to the race. + +In Thread 1, in rdtgroup_kn_unlock(), referring to earlier rdtgrp memory +(rdtgrp->waitcount) which was already freed in Thread 2 results in +use-after-free issue. + +Thread 1 (rdtgroup_mondata_show) Thread 2 (rdtgroup_rmdir) +-------------------------------- ------------------------- +rdtgroup_kn_lock_live + /* + * kn active protection until + * kernfs_break_active_protection(kn) + */ + rdtgrp = kernfs_to_rdtgroup(kn) + rdtgroup_kn_lock_live + atomic_inc(&rdtgrp->waitcount) + mutex_lock + rdtgroup_rmdir_ctrl + free_all_child_rdtgrp + /* + * sentry->waitcount should be 1 + * but is 0 now due to the race. + */ + kfree(sentry)*[1] + /* + * Only after kernfs_remove() + * completes, the refcount of + * rdtgrp could be trusted. + */ + atomic_inc(&rdtgrp->waitcount) + /* kn->active-- */ + kernfs_break_active_protection(kn) + rdtgroup_ctrl_remove + rdtgrp->flags = RDT_DELETED + /* + * Blocking routine, wait for + * all sub kernfs nodes to drop + * active reference in + * kernfs_break_active_protection. + */ + kernfs_remove(rdtgrp->kn) + rdtgroup_kn_unlock + mutex_unlock + atomic_dec_and_test( + &rdtgrp->waitcount) + && (flags & RDT_DELETED) + kernfs_unbreak_active_protection(kn) + kfree(rdtgrp) + mutex_lock +mon_event_read +rdtgroup_kn_unlock + mutex_unlock + /* + * Use-after-free: refer to earlier rdtgrp + * memory which was freed in [1]. + */ + atomic_dec_and_test(&rdtgrp->waitcount) + && (flags & RDT_DELETED) + /* kn->active++ */ + kernfs_unbreak_active_protection(kn) + kfree(rdtgrp) + +Fix it by moving free_all_child_rdtgrp() to after kernfs_remove() in +rdtgroup_rmdir_ctrl() to ensure it has the accurate refcount of rdtgrp. + +Fixes: f3cbeacaa06e ("x86/intel_rdt/cqm: Add rmdir support") +Suggested-by: Reinette Chatre +Signed-off-by: Xiaochen Shen +Signed-off-by: Borislav Petkov +Reviewed-by: Reinette Chatre +Reviewed-by: Tony Luck +Acked-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1578500886-21771-3-git-send-email-xiaochen.shen@intel.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +index c7564294a12a8..954fd048ad9bd 100644 +--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c ++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +@@ -2960,13 +2960,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + ++ rdtgroup_ctrl_remove(kn, rdtgrp); ++ + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + +- rdtgroup_ctrl_remove(kn, rdtgrp); +- + return 0; + } + +-- +2.20.1 + diff --git a/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch b/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch new file mode 100644 index 00000000000..8a0abaf7337 --- /dev/null +++ b/queue-5.4/x86-resctrl-fix-use-after-free-when-deleting-resourc.patch @@ -0,0 +1,224 @@ +From 545e5454ad863083e84ad46372551f81b93f47f8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Jan 2020 00:28:03 +0800 +Subject: x86/resctrl: Fix use-after-free when deleting resource groups + +From: Xiaochen Shen + +[ Upstream commit b8511ccc75c033f6d54188ea4df7bf1e85778740 ] + +A resource group (rdtgrp) contains a reference count (rdtgrp->waitcount) +that indicates how many waiters expect this rdtgrp to exist. Waiters +could be waiting on rdtgroup_mutex or some work sitting on a task's +workqueue for when the task returns from kernel mode or exits. + +The deletion of a rdtgrp is intended to have two phases: + + (1) while holding rdtgroup_mutex the necessary cleanup is done and + rdtgrp->flags is set to RDT_DELETED, + + (2) after releasing the rdtgroup_mutex, the rdtgrp structure is freed + only if there are no waiters and its flag is set to RDT_DELETED. Upon + gaining access to rdtgroup_mutex or rdtgrp, a waiter is required to check + for the RDT_DELETED flag. + +When unmounting the resctrl file system or deleting ctrl_mon groups, +all of the subdirectories are removed and the data structure of rdtgrp +is forcibly freed without checking rdtgrp->waitcount. If at this point +there was a waiter on rdtgrp then a use-after-free issue occurs when the +waiter starts running and accesses the rdtgrp structure it was waiting +on. + +See kfree() calls in [1], [2] and [3] in these two call paths in +following scenarios: +(1) rdt_kill_sb() -> rmdir_all_sub() -> free_all_child_rdtgrp() +(2) rdtgroup_rmdir() -> rdtgroup_rmdir_ctrl() -> free_all_child_rdtgrp() + +There are several scenarios that result in use-after-free issue in +following: + +Scenario 1: +----------- +In Thread 1, rdtgroup_tasks_write() adds a task_work callback +move_myself(). If move_myself() is scheduled to execute after Thread 2 +rdt_kill_sb() is finished, referring to earlier rdtgrp memory +(rdtgrp->waitcount) which was already freed in Thread 2 results in +use-after-free issue. + +Thread 1 (rdtgroup_tasks_write) Thread 2 (rdt_kill_sb) +------------------------------- ---------------------- +rdtgroup_kn_lock_live + atomic_inc(&rdtgrp->waitcount) + mutex_lock +rdtgroup_move_task + __rdtgroup_move_task + /* + * Take an extra refcount, so rdtgrp cannot be freed + * before the call back move_myself has been invoked + */ + atomic_inc(&rdtgrp->waitcount) + /* Callback move_myself will be scheduled for later */ + task_work_add(move_myself) +rdtgroup_kn_unlock + mutex_unlock + atomic_dec_and_test(&rdtgrp->waitcount) + && (flags & RDT_DELETED) + mutex_lock + rmdir_all_sub + /* + * sentry and rdtgrp are freed + * without checking refcount + */ + free_all_child_rdtgrp + kfree(sentry)*[1] + kfree(rdtgrp)*[2] + mutex_unlock +/* + * Callback is scheduled to execute + * after rdt_kill_sb is finished + */ +move_myself + /* + * Use-after-free: refer to earlier rdtgrp + * memory which was freed in [1] or [2]. + */ + atomic_dec_and_test(&rdtgrp->waitcount) + && (flags & RDT_DELETED) + kfree(rdtgrp) + +Scenario 2: +----------- +In Thread 1, rdtgroup_tasks_write() adds a task_work callback +move_myself(). If move_myself() is scheduled to execute after Thread 2 +rdtgroup_rmdir() is finished, referring to earlier rdtgrp memory +(rdtgrp->waitcount) which was already freed in Thread 2 results in +use-after-free issue. + +Thread 1 (rdtgroup_tasks_write) Thread 2 (rdtgroup_rmdir) +------------------------------- ------------------------- +rdtgroup_kn_lock_live + atomic_inc(&rdtgrp->waitcount) + mutex_lock +rdtgroup_move_task + __rdtgroup_move_task + /* + * Take an extra refcount, so rdtgrp cannot be freed + * before the call back move_myself has been invoked + */ + atomic_inc(&rdtgrp->waitcount) + /* Callback move_myself will be scheduled for later */ + task_work_add(move_myself) +rdtgroup_kn_unlock + mutex_unlock + atomic_dec_and_test(&rdtgrp->waitcount) + && (flags & RDT_DELETED) + rdtgroup_kn_lock_live + atomic_inc(&rdtgrp->waitcount) + mutex_lock + rdtgroup_rmdir_ctrl + free_all_child_rdtgrp + /* + * sentry is freed without + * checking refcount + */ + kfree(sentry)*[3] + rdtgroup_ctrl_remove + rdtgrp->flags = RDT_DELETED + rdtgroup_kn_unlock + mutex_unlock + atomic_dec_and_test( + &rdtgrp->waitcount) + && (flags & RDT_DELETED) + kfree(rdtgrp) +/* + * Callback is scheduled to execute + * after rdt_kill_sb is finished + */ +move_myself + /* + * Use-after-free: refer to earlier rdtgrp + * memory which was freed in [3]. + */ + atomic_dec_and_test(&rdtgrp->waitcount) + && (flags & RDT_DELETED) + kfree(rdtgrp) + +If CONFIG_DEBUG_SLAB=y, Slab corruption on kmalloc-2k can be observed +like following. Note that "0x6b" is POISON_FREE after kfree(). The +corrupted bits "0x6a", "0x64" at offset 0x424 correspond to +waitcount member of struct rdtgroup which was freed: + + Slab corruption (Not tainted): kmalloc-2k start=ffff9504c5b0d000, len=2048 + 420: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkjkkkkkkkkkkk + Single bit error detected. Probably bad RAM. + Run memtest86+ or a similar memory test tool. + Next obj: start=ffff9504c5b0d800, len=2048 + 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + + Slab corruption (Not tainted): kmalloc-2k start=ffff9504c58ab800, len=2048 + 420: 6b 6b 6b 6b 64 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkdkkkkkkkkkkk + Prev obj: start=ffff9504c58ab000, len=2048 + 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + +Fix this by taking reference count (waitcount) of rdtgrp into account in +the two call paths that currently do not do so. Instead of always +freeing the resource group it will only be freed if there are no waiters +on it. If there are waiters, the resource group will have its flags set +to RDT_DELETED. + +It will be left to the waiter to free the resource group when it starts +running and finding that it was the last waiter and the resource group +has been removed (rdtgrp->flags & RDT_DELETED) since. (1) rdt_kill_sb() +-> rmdir_all_sub() -> free_all_child_rdtgrp() (2) rdtgroup_rmdir() -> +rdtgroup_rmdir_ctrl() -> free_all_child_rdtgrp() + +Fixes: f3cbeacaa06e ("x86/intel_rdt/cqm: Add rmdir support") +Fixes: 60cf5e101fd4 ("x86/intel_rdt: Add mkdir to resctrl file system") +Suggested-by: Reinette Chatre +Signed-off-by: Xiaochen Shen +Signed-off-by: Borislav Petkov +Reviewed-by: Reinette Chatre +Reviewed-by: Tony Luck +Acked-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1578500886-21771-2-git-send-email-xiaochen.shen@intel.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +index e4da26325e3ea..c7564294a12a8 100644 +--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c ++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +@@ -2205,7 +2205,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); +- kfree(sentry); ++ ++ if (atomic_read(&sentry->waitcount) != 0) ++ sentry->flags = RDT_DELETED; ++ else ++ kfree(sentry); + } + } + +@@ -2243,7 +2247,11 @@ static void rmdir_all_sub(void) + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); +- kfree(rdtgrp); ++ ++ if (atomic_read(&rdtgrp->waitcount) != 0) ++ rdtgrp->flags = RDT_DELETED; ++ else ++ kfree(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + update_closid_rmid(cpu_online_mask, &rdtgroup_default); +-- +2.20.1 +