fixes for 4.9

author Sasha Levin <sashal@kernel.org>

Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)

committer Sasha Levin <sashal@kernel.org>

Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)
author Sasha Levin <sashal@kernel.org>
Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)
committer Sasha Levin <sashal@kernel.org>
Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)
diff --git a/queue-4.9/locking-rwsem-prevent-decrement-of-reader-count-befo.patch b/queue-4.9/locking-rwsem-prevent-decrement-of-reader-count-befo.patch

new file mode 100644 (file)

index 0000000..040e2cd
--- /dev/null
+++ b/queue-4.9/locking-rwsem-prevent-decrement-of-reader-count-befo.patch
@@ -0,0 +1,129 @@
+From 73a822880c775230d03565de9e6e3c4d92b812ce Mon Sep 17 00:00:00 2001
+From: Waiman Long <longman@redhat.com>
+Date: Sun, 28 Apr 2019 17:25:38 -0400
+Subject: locking/rwsem: Prevent decrement of reader count before increment
+
+[ Upstream commit a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 ]
+
+During my rwsem testing, it was found that after a down_read(), the
+reader count may occasionally become 0 or even negative. Consequently,
+a writer may steal the lock at that time and execute with the reader
+in parallel thus breaking the mutual exclusion guarantee of the write
+lock. In other words, both readers and writer can become rwsem owners
+simultaneously.
+
+The current reader wakeup code does it in one pass to clear waiter->task
+and put them into wake_q before fully incrementing the reader count.
+Once waiter->task is cleared, the corresponding reader may see it,
+finish the critical section and do unlock to decrement the count before
+the count is incremented. This is not a problem if there is only one
+reader to wake up as the count has been pre-incremented by 1.  It is
+a problem if there is more than one reader to be woken up and a writer
+can steal the lock.
+
+The wakeup was actually done in 2 passes before the following v4.9 commit:
+
+  70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")
+
+To fix this problem, the wakeup is now done in two passes
+again. In the first pass, we collect the readers and count them.
+The reader count is then fully incremented. In the second pass, the
+waiter->task is then cleared and they are put into wake_q to be woken
+up later.
+
+Signed-off-by: Waiman Long <longman@redhat.com>
+Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: huang ying <huang.ying.caritas@gmail.com>
+Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")
+Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/locking/rwsem-xadd.c | 44 +++++++++++++++++++++++++------------
+ 1 file changed, 30 insertions(+), 14 deletions(-)
+
+diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
+index be06c45cbe4f9..0cdbb636e3163 100644
+--- a/kernel/locking/rwsem-xadd.c
++++ b/kernel/locking/rwsem-xadd.c
+@@ -127,6 +127,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
+ {
+       struct rwsem_waiter *waiter, *tmp;
+       long oldcount, woken = 0, adjustment = 0;
++      struct list_head wlist;
+ 
+       /*
+        * Take a peek at the queue head waiter such that we can determine
+@@ -185,18 +186,42 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
+        * of the queue. We know that woken will be at least 1 as we accounted
+        * for above. Note we increment the 'active part' of the count by the
+        * number of readers before waking any processes up.
++       *
++       * We have to do wakeup in 2 passes to prevent the possibility that
++       * the reader count may be decremented before it is incremented. It
++       * is because the to-be-woken waiter may not have slept yet. So it
++       * may see waiter->task got cleared, finish its critical section and
++       * do an unlock before the reader count increment.
++       *
++       * 1) Collect the read-waiters in a separate list, count them and
++       *    fully increment the reader count in rwsem.
++       * 2) For each waiter in the new list, clear waiter->task and
++       *    put it into wake_q to be woken up later.
+        */
+-      list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+-              struct task_struct *tsk;
+-
++      list_for_each_entry(waiter, &sem->wait_list, list) {
+               if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+                       break;
+ 
+               woken++;
+-              tsk = waiter->task;
++      }
++      list_cut_before(&wlist, &sem->wait_list, &waiter->list);
++
++      adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
++      if (list_empty(&sem->wait_list)) {
++              /* hit end of list above */
++              adjustment -= RWSEM_WAITING_BIAS;
++      }
++
++      if (adjustment)
++              atomic_long_add(adjustment, &sem->count);
++
++      /* 2nd pass */
++      list_for_each_entry_safe(waiter, tmp, &wlist, list) {
++              struct task_struct *tsk;
+ 
++              tsk = waiter->task;
+               get_task_struct(tsk);
+-              list_del(&waiter->list);
++
+               /*
+                * Ensure calling get_task_struct() before setting the reader
+                * waiter to nil such that rwsem_down_read_failed() cannot
+@@ -212,15 +237,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
+               /* wake_q_add() already take the task ref */
+               put_task_struct(tsk);
+       }
+-
+-      adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+-      if (list_empty(&sem->wait_list)) {
+-              /* hit end of list above */
+-              adjustment -= RWSEM_WAITING_BIAS;
+-      }
+-
+-      if (adjustment)
+-              atomic_long_add(adjustment, &sem->count);
+ }
+ 
+ /*
+-- 
+2.20.1
+
diff --git a/queue-4.9/net-core-another-layer-of-lists-around-pf_memalloc-s.patch b/queue-4.9/net-core-another-layer-of-lists-around-pf_memalloc-s.patch

new file mode 100644 (file)

index 0000000..61b9d95
--- /dev/null
+++ b/queue-4.9/net-core-another-layer-of-lists-around-pf_memalloc-s.patch
@@ -0,0 +1,64 @@
+From cc83886d9a61b4303a66f562b56e7447627d4992 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 May 2019 21:30:49 -0400
+Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling
+
+[ Upstream commit 78ed8cc25986ac5c21762eeddc1e86e94d422e36 ]
+
+First example of a layer splitting the list (rather than merely taking
+ individual packets off it).
+Involves new list.h function, list_cut_before(), like list_cut_position()
+ but cuts on the other side of the given entry.
+
+Signed-off-by: Edward Cree <ecree@solarflare.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[sl: cut out non list.h bits, we only want list_cut_before]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/list.h | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/include/linux/list.h b/include/linux/list.h
+index 5809e9a2de5b2..6f935018ea056 100644
+--- a/include/linux/list.h
++++ b/include/linux/list.h
+@@ -271,6 +271,36 @@ static inline void list_cut_position(struct list_head *list,
+               __list_cut_position(list, head, entry);
+ }
+ 
++/**
++ * list_cut_before - cut a list into two, before given entry
++ * @list: a new list to add all removed entries
++ * @head: a list with entries
++ * @entry: an entry within head, could be the head itself
++ *
++ * This helper moves the initial part of @head, up to but
++ * excluding @entry, from @head to @list.  You should pass
++ * in @entry an element you know is on @head.  @list should
++ * be an empty list or a list you do not care about losing
++ * its data.
++ * If @entry == @head, all entries on @head are moved to
++ * @list.
++ */
++static inline void list_cut_before(struct list_head *list,
++                                 struct list_head *head,
++                                 struct list_head *entry)
++{
++      if (head->next == entry) {
++              INIT_LIST_HEAD(list);
++              return;
++      }
++      list->next = head->next;
++      list->next->prev = list;
++      list->prev = entry->prev;
++      list->prev->next = list;
++      head->next = entry;
++      entry->prev = head;
++}
++
+ static inline void __list_splice(const struct list_head *list,
+                                struct list_head *prev,
+                                struct list_head *next)
+-- 
+2.20.1
+
diff --git a/queue-4.9/pci-hv-fix-a-memory-leak-in-hv_eject_device_work.patch b/queue-4.9/pci-hv-fix-a-memory-leak-in-hv_eject_device_work.patch

new file mode 100644 (file)

index 0000000..0a04d24
--- /dev/null
+++ b/queue-4.9/pci-hv-fix-a-memory-leak-in-hv_eject_device_work.patch
@@ -0,0 +1,51 @@
+From f8b31684437ba3bfcf62a8ce9c845d4ff370095d Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 15 May 2019 15:42:07 -0700
+Subject: PCI: hv: Fix a memory leak in hv_eject_device_work()
+
+[ Upstream commit 05f151a73ec2b23ffbff706e5203e729a995cdc2 ]
+
+When a device is created in new_pcichild_device(), hpdev->refs is set
+to 2 (i.e. the initial value of 1 plus the get_pcichild()).
+
+When we hot remove the device from the host, in a Linux VM we first call
+hv_pci_eject_device(), which increases hpdev->refs by get_pcichild() and
+then schedules a work of hv_eject_device_work(), so hpdev->refs becomes
+3 (let's ignore the paired get/put_pcichild() in other places). But in
+hv_eject_device_work(), currently we only call put_pcichild() twice,
+meaning the 'hpdev' struct can't be freed in put_pcichild().
+
+Add one put_pcichild() to fix the memory leak.
+
+The device can also be removed when we run "rmmod pci-hyperv". On this
+path (hv_pci_remove() -> hv_pci_bus_exit() -> hv_pci_devices_present()),
+hpdev->refs is 2, and we do correctly call put_pcichild() twice in
+pci_devices_present_work().
+
+Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+[lorenzo.pieralisi@arm.com: commit log rework]
+Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
+Reviewed-by:  Michael Kelley <mikelley@microsoft.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/pci/host/pci-hyperv.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
+index b4d8ccfd9f7c2..200b415765264 100644
+--- a/drivers/pci/host/pci-hyperv.c
++++ b/drivers/pci/host/pci-hyperv.c
+@@ -1620,6 +1620,7 @@ static void hv_eject_device_work(struct work_struct *work)
+       spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
+ 
+       put_pcichild(hpdev, hv_pcidev_ref_childlist);
++      put_pcichild(hpdev, hv_pcidev_ref_initial);
+       put_pcichild(hpdev, hv_pcidev_ref_pnp);
+       put_hvpcibus(hpdev->hbus);
+ }
+-- 
+2.20.1
+
diff --git a/queue-4.9/series b/queue-4.9/series

new file mode 100644 (file)

index 0000000..f40ffcc
--- /dev/null
+++ b/queue-4.9/series
@@ -0,0 +1,3 @@
+net-core-another-layer-of-lists-around-pf_memalloc-s.patch
+locking-rwsem-prevent-decrement-of-reader-count-befo.patch
+pci-hv-fix-a-memory-leak-in-hv_eject_device_work.patch
author	Sasha Levin <sashal@kernel.org>
	Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)
committer	Sasha Levin <sashal@kernel.org>
	Fri, 17 May 2019 02:10:55 +0000 (22:10 -0400)
queue-4.9/locking-rwsem-prevent-decrement-of-reader-count-befo.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-core-another-layer-of-lists-around-pf_memalloc-s.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/pci-hv-fix-a-memory-leak-in-hv_eject_device_work.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/series	[new file with mode: 0644]	patch \| blob