5.10-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)
diff --git a/queue-5.10/io_uring-add-a-helper-for-setting-a-ref-node.patch b/queue-5.10/io_uring-add-a-helper-for-setting-a-ref-node.patch

new file mode 100644 (file)

index 0000000..a49160c
--- /dev/null
+++ b/queue-5.10/io_uring-add-a-helper-for-setting-a-ref-node.patch
@@ -0,0 +1,66 @@
+From 1642b4450d20e31439c80c28256c8eee08684698 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 30 Dec 2020 21:34:14 +0000
+Subject: io_uring: add a helper for setting a ref node
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 1642b4450d20e31439c80c28256c8eee08684698 upstream.
+
+Setting a new reference node to a file data is not trivial, don't repeat
+it, add and use a helper.
+
+Cc: stable@vger.kernel.org # 5.6+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/io_uring.c |   22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -6991,6 +6991,16 @@ static void io_file_ref_kill(struct perc
+       complete(&data->done);
+ }
+ 
++static void io_sqe_files_set_node(struct fixed_file_data *file_data,
++                                struct fixed_file_ref_node *ref_node)
++{
++      spin_lock(&file_data->lock);
++      file_data->node = ref_node;
++      list_add_tail(&ref_node->node, &file_data->ref_list);
++      spin_unlock(&file_data->lock);
++      percpu_ref_get(&file_data->refs);
++}
++
+ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+ {
+       struct fixed_file_data *data = ctx->file_data;
+@@ -7519,11 +7529,7 @@ static int io_sqe_files_register(struct
+               return PTR_ERR(ref_node);
+       }
+ 
+-      file_data->node = ref_node;
+-      spin_lock(&file_data->lock);
+-      list_add_tail(&ref_node->node, &file_data->ref_list);
+-      spin_unlock(&file_data->lock);
+-      percpu_ref_get(&file_data->refs);
++      io_sqe_files_set_node(file_data, ref_node);
+       return ret;
+ out_fput:
+       for (i = 0; i < ctx->nr_user_files; i++) {
+@@ -7679,11 +7685,7 @@ static int __io_sqe_files_update(struct
+ 
+       if (needs_switch) {
+               percpu_ref_kill(&data->node->refs);
+-              spin_lock(&data->lock);
+-              list_add_tail(&ref_node->node, &data->ref_list);
+-              data->node = ref_node;
+-              spin_unlock(&data->lock);
+-              percpu_ref_get(&ctx->file_data->refs);
++              io_sqe_files_set_node(data, ref_node);
+       } else
+               destroy_fixed_file_ref_node(ref_node);
+ 
diff --git a/queue-5.10/io_uring-don-t-assume-mm-is-constant-across-submits.patch b/queue-5.10/io_uring-don-t-assume-mm-is-constant-across-submits.patch

new file mode 100644 (file)

index 0000000..708a6ec
--- /dev/null
+++ b/queue-5.10/io_uring-don-t-assume-mm-is-constant-across-submits.patch
@@ -0,0 +1,56 @@
+From 77788775c7132a8d93c6930ab1bd84fc743c7cb7 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 29 Dec 2020 10:50:46 -0700
+Subject: io_uring: don't assume mm is constant across submits
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 77788775c7132a8d93c6930ab1bd84fc743c7cb7 upstream.
+
+If we COW the identity, we assume that ->mm never changes. But this
+isn't true if multiple processes end up sharing the ring. Hence treat
+id->mm like any other process component when it comes to the
+identity mapping. This is pretty trivial, just moving the existing grab
+into io_grab_identity(), and including a check for the match.
+
+Cc: stable@vger.kernel.org # 5.10
+Fixes: 1e6fa5216a0e ("io_uring: COW io_identity on mismatch")
+Reported-by: Christian Brauner <christian.brauner@ubuntu.com>:
+Tested-by: Christian Brauner <christian.brauner@ubuntu.com>:
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/io_uring.c |   14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1369,6 +1369,13 @@ static bool io_grab_identity(struct io_k
+               spin_unlock_irq(&ctx->inflight_lock);
+               req->work.flags |= IO_WQ_WORK_FILES;
+       }
++      if (!(req->work.flags & IO_WQ_WORK_MM) &&
++          (def->work_flags & IO_WQ_WORK_MM)) {
++              if (id->mm != current->mm)
++                      return false;
++              mmgrab(id->mm);
++              req->work.flags |= IO_WQ_WORK_MM;
++      }
+ 
+       return true;
+ }
+@@ -1393,13 +1400,6 @@ static void io_prep_async_work(struct io
+                       req->work.flags |= IO_WQ_WORK_UNBOUND;
+       }
+ 
+-      /* ->mm can never change on us */
+-      if (!(req->work.flags & IO_WQ_WORK_MM) &&
+-          (def->work_flags & IO_WQ_WORK_MM)) {
+-              mmgrab(id->mm);
+-              req->work.flags |= IO_WQ_WORK_MM;
+-      }
+-
+       /* if we fail grabbing identity, we must COW, regrab, and retry */
+       if (io_grab_identity(req))
+               return;
diff --git a/queue-5.10/io_uring-fix-io_sqe_files_unregister-hangs.patch b/queue-5.10/io_uring-fix-io_sqe_files_unregister-hangs.patch

new file mode 100644 (file)

index 0000000..33d6660
--- /dev/null
+++ b/queue-5.10/io_uring-fix-io_sqe_files_unregister-hangs.patch
@@ -0,0 +1,81 @@
+From 1ffc54220c444774b7f09e6d2121e732f8e19b94 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 30 Dec 2020 21:34:15 +0000
+Subject: io_uring: fix io_sqe_files_unregister() hangs
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 1ffc54220c444774b7f09e6d2121e732f8e19b94 upstream.
+
+io_sqe_files_unregister() uninterruptibly waits for enqueued ref nodes,
+however requests keeping them may never complete, e.g. because of some
+userspace dependency. Make sure it's interruptible otherwise it would
+hang forever.
+
+Cc: stable@vger.kernel.org # 5.6+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/io_uring.c |   24 ++++++++++++++++++++++--
+ 1 file changed, 22 insertions(+), 2 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -941,6 +941,10 @@ enum io_mem_account {
+       ACCT_PINNED,
+ };
+ 
++static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node);
++static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
++                      struct io_ring_ctx *ctx);
++
+ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+                            struct io_comp_state *cs);
+ static void io_cqring_fill_event(struct io_kiocb *req, long res);
+@@ -7004,11 +7008,15 @@ static void io_sqe_files_set_node(struct
+ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+ {
+       struct fixed_file_data *data = ctx->file_data;
+-      struct fixed_file_ref_node *ref_node = NULL;
++      struct fixed_file_ref_node *backup_node, *ref_node = NULL;
+       unsigned nr_tables, i;
++      int ret;
+ 
+       if (!data)
+               return -ENXIO;
++      backup_node = alloc_fixed_file_ref_node(ctx);
++      if (!backup_node)
++              return -ENOMEM;
+ 
+       spin_lock(&data->lock);
+       ref_node = data->node;
+@@ -7020,7 +7028,18 @@ static int io_sqe_files_unregister(struc
+ 
+       /* wait for all refs nodes to complete */
+       flush_delayed_work(&ctx->file_put_work);
+-      wait_for_completion(&data->done);
++      do {
++              ret = wait_for_completion_interruptible(&data->done);
++              if (!ret)
++                      break;
++              ret = io_run_task_work_sig();
++              if (ret < 0) {
++                      percpu_ref_resurrect(&data->refs);
++                      reinit_completion(&data->done);
++                      io_sqe_files_set_node(data, backup_node);
++                      return ret;
++              }
++      } while (1);
+ 
+       __io_sqe_files_unregister(ctx);
+       nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
+@@ -7031,6 +7050,7 @@ static int io_sqe_files_unregister(struc
+       kfree(data);
+       ctx->file_data = NULL;
+       ctx->nr_user_files = 0;
++      destroy_fixed_file_ref_node(backup_node);
+       return 0;
+ }
+ 
diff --git a/queue-5.10/kernel-io_uring-cancel-io_uring-before-task-works.patch b/queue-5.10/kernel-io_uring-cancel-io_uring-before-task-works.patch

new file mode 100644 (file)

index 0000000..d74d00d
--- /dev/null
+++ b/queue-5.10/kernel-io_uring-cancel-io_uring-before-task-works.patch
@@ -0,0 +1,64 @@
+From b1b6b5a30dce872f500dc43f067cba8e7f86fc7d Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 30 Dec 2020 21:34:16 +0000
+Subject: kernel/io_uring: cancel io_uring before task works
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit b1b6b5a30dce872f500dc43f067cba8e7f86fc7d upstream.
+
+For cancelling io_uring requests it needs either to be able to run
+currently enqueued task_works or having it shut down by that moment.
+Otherwise io_uring_cancel_files() may be waiting for requests that won't
+ever complete.
+
+Go with the first way and do cancellations before setting PF_EXITING and
+so before putting the task_work infrastructure into a transition state
+where task_work_run() would better not be called.
+
+Cc: stable@vger.kernel.org # 5.5+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/file.c     |    2 --
+ kernel/exit.c |    2 ++
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/file.c
++++ b/fs/file.c
+@@ -21,7 +21,6 @@
+ #include <linux/rcupdate.h>
+ #include <linux/close_range.h>
+ #include <net/sock.h>
+-#include <linux/io_uring.h>
+ 
+ unsigned int sysctl_nr_open __read_mostly = 1024*1024;
+ unsigned int sysctl_nr_open_min = BITS_PER_LONG;
+@@ -453,7 +452,6 @@ void exit_files(struct task_struct *tsk)
+       struct files_struct * files = tsk->files;
+ 
+       if (files) {
+-              io_uring_files_cancel(files);
+               task_lock(tsk);
+               tsk->files = NULL;
+               task_unlock(tsk);
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -63,6 +63,7 @@
+ #include <linux/random.h>
+ #include <linux/rcuwait.h>
+ #include <linux/compat.h>
++#include <linux/io_uring.h>
+ 
+ #include <linux/uaccess.h>
+ #include <asm/unistd.h>
+@@ -762,6 +763,7 @@ void __noreturn do_exit(long code)
+               schedule();
+       }
+ 
++      io_uring_files_cancel(tsk->files);
+       exit_signals(tsk);  /* sets PF_EXITING */
+ 
+       /* sync mm's RSS info before statistics gathering */
diff --git a/queue-5.10/lib-zlib-fix-inflating-zlib-streams-on-s390.patch b/queue-5.10/lib-zlib-fix-inflating-zlib-streams-on-s390.patch

new file mode 100644 (file)

index 0000000..921b858
--- /dev/null
+++ b/queue-5.10/lib-zlib-fix-inflating-zlib-streams-on-s390.patch
@@ -0,0 +1,61 @@
+From f0bb29e8c4076444d32df00c8d32e169ceecf283 Mon Sep 17 00:00:00 2001
+From: Ilya Leoshkevich <iii@linux.ibm.com>
+Date: Tue, 29 Dec 2020 15:15:01 -0800
+Subject: lib/zlib: fix inflating zlib streams on s390
+
+From: Ilya Leoshkevich <iii@linux.ibm.com>
+
+commit f0bb29e8c4076444d32df00c8d32e169ceecf283 upstream.
+
+Decompressing zlib streams on s390 fails with "incorrect data check"
+error.
+
+Userspace zlib checks inflate_state.flags in order to byteswap checksums
+only for zlib streams, and s390 hardware inflate code, which was ported
+from there, tries to match this behavior.  At the same time, kernel zlib
+does not use inflate_state.flags, so it contains essentially random
+values.  For many use cases either zlib stream is zeroed out or checksum
+is not used, so this problem is masked, but at least SquashFS is still
+affected.
+
+Fix by always passing a checksum to and from the hardware as is, which
+matches zlib_inflate()'s expectations.
+
+Link: https://lkml.kernel.org/r/20201215155551.894884-1-iii@linux.ibm.com
+Fixes: 126196100063 ("lib/zlib: add s390 hardware support for kernel zlib_inflate")
+Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Acked-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
+Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: Mikhail Zaslonko <zaslonko@linux.ibm.com>
+Cc: <stable@vger.kernel.org>   [5.6+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ lib/zlib_dfltcc/dfltcc_inflate.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/lib/zlib_dfltcc/dfltcc_inflate.c
++++ b/lib/zlib_dfltcc/dfltcc_inflate.c
+@@ -125,7 +125,7 @@ dfltcc_inflate_action dfltcc_inflate(
+     param->ho = (state->write - state->whave) & ((1 << HB_BITS) - 1);
+     if (param->hl)
+         param->nt = 0; /* Honor history for the first block */
+-    param->cv = state->flags ? REVERSE(state->check) : state->check;
++    param->cv = state->check;
+ 
+     /* Inflate */
+     do {
+@@ -138,7 +138,7 @@ dfltcc_inflate_action dfltcc_inflate(
+     state->bits = param->sbb;
+     state->whave = param->hl;
+     state->write = (param->ho + param->hl) & ((1 << HB_BITS) - 1);
+-    state->check = state->flags ? REVERSE(param->cv) : param->cv;
++    state->check = param->cv;
+     if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+         /* Report an error if stream is corrupted */
+         state->mode = BAD;
diff --git a/queue-5.10/mm-hugetlb-fix-deadlock-in-hugetlb_cow-error-path.patch b/queue-5.10/mm-hugetlb-fix-deadlock-in-hugetlb_cow-error-path.patch

new file mode 100644 (file)

index 0000000..d06ee7e
--- /dev/null
+++ b/queue-5.10/mm-hugetlb-fix-deadlock-in-hugetlb_cow-error-path.patch
@@ -0,0 +1,89 @@
+From e7dd91c456a8cdbcd7066997d15e36d14276a949 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Tue, 29 Dec 2020 15:14:25 -0800
+Subject: mm/hugetlb: fix deadlock in hugetlb_cow error path
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit e7dd91c456a8cdbcd7066997d15e36d14276a949 upstream.
+
+syzbot reported the deadlock here [1].  The issue is in hugetlb cow
+error handling when there are not enough huge pages for the faulting
+task which took the original reservation.  It is possible that other
+(child) tasks could have consumed pages associated with the reservation.
+In this case, we want the task which took the original reservation to
+succeed.  So, we unmap any associated pages in children so that they can
+be used by the faulting task that owns the reservation.
+
+The unmapping code needs to hold i_mmap_rwsem in write mode.  However,
+due to commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd
+sharing synchronization") we are already holding i_mmap_rwsem in read
+mode when hugetlb_cow is called.
+
+Technically, i_mmap_rwsem does not need to be held in read mode for COW
+mappings as they can not share pmd's.  Modifying the fault code to not
+take i_mmap_rwsem in read mode for COW (and other non-sharable) mappings
+is too involved for a stable fix.
+
+Instead, we simply drop the hugetlb_fault_mutex and i_mmap_rwsem before
+unmapping.  This is OK as it is technically not needed.  They are
+reacquired after unmapping as expected by calling code.  Since this is
+done in an uncommon error path, the overhead of dropping and reacquiring
+mutexes is acceptable.
+
+While making changes, remove redundant BUG_ON after unmap_ref_private.
+
+[1] https://lkml.kernel.org/r/000000000000b73ccc05b5cf8558@google.com
+
+Link: https://lkml.kernel.org/r/4c5781b8-3b00-761e-c0c7-c5edebb6ec1a@oracle.com
+Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reported-by: syzbot+5eee4145df3c15e96625@syzkaller.appspotmail.com
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |   22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -4106,10 +4106,30 @@ retry_avoidcopy:
+                * may get SIGKILLed if it later faults.
+                */
+               if (outside_reserve) {
++                      struct address_space *mapping = vma->vm_file->f_mapping;
++                      pgoff_t idx;
++                      u32 hash;
++
+                       put_page(old_page);
+                       BUG_ON(huge_pte_none(pte));
++                      /*
++                       * Drop hugetlb_fault_mutex and i_mmap_rwsem before
++                       * unmapping.  unmapping needs to hold i_mmap_rwsem
++                       * in write mode.  Dropping i_mmap_rwsem in read mode
++                       * here is OK as COW mappings do not interact with
++                       * PMD sharing.
++                       *
++                       * Reacquire both after unmap operation.
++                       */
++                      idx = vma_hugecache_offset(h, vma, haddr);
++                      hash = hugetlb_fault_mutex_hash(mapping, idx);
++                      mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++                      i_mmap_unlock_read(mapping);
++
+                       unmap_ref_private(mm, vma, old_page, haddr);
+-                      BUG_ON(huge_pte_none(pte));
++
++                      i_mmap_lock_read(mapping);
++                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       spin_lock(ptl);
+                       ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+                       if (likely(ptep &&
diff --git a/queue-5.10/mm-memmap-defer-init-doesn-t-work-as-expected.patch b/queue-5.10/mm-memmap-defer-init-doesn-t-work-as-expected.patch

new file mode 100644 (file)

index 0000000..8c6af60
--- /dev/null
+++ b/queue-5.10/mm-memmap-defer-init-doesn-t-work-as-expected.patch
@@ -0,0 +1,155 @@
+From dc2da7b45ffe954a0090f5d0310ed7b0b37d2bd2 Mon Sep 17 00:00:00 2001
+From: Baoquan He <bhe@redhat.com>
+Date: Tue, 29 Dec 2020 15:14:37 -0800
+Subject: mm: memmap defer init doesn't work as expected
+
+From: Baoquan He <bhe@redhat.com>
+
+commit dc2da7b45ffe954a0090f5d0310ed7b0b37d2bd2 upstream.
+
+VMware observed a performance regression during memmap init on their
+platform, and bisected to commit 73a6e474cb376 ("mm: memmap_init:
+iterate over memblock regions rather that check each PFN") causing it.
+
+Before the commit:
+
+  [0.033176] Normal zone: 1445888 pages used for memmap
+  [0.033176] Normal zone: 89391104 pages, LIFO batch:63
+  [0.035851] ACPI: PM-Timer IO Port: 0x448
+
+With commit
+
+  [0.026874] Normal zone: 1445888 pages used for memmap
+  [0.026875] Normal zone: 89391104 pages, LIFO batch:63
+  [2.028450] ACPI: PM-Timer IO Port: 0x448
+
+The root cause is the current memmap defer init doesn't work as expected.
+
+Before, memmap_init_zone() was used to do memmap init of one whole zone,
+to initialize all low zones of one numa node, but defer memmap init of
+the last zone in that numa node.  However, since commit 73a6e474cb376,
+function memmap_init() is adapted to iterate over memblock regions
+inside one zone, then call memmap_init_zone() to do memmap init for each
+region.
+
+E.g., on VMware's system, the memory layout is as below, there are two
+memory regions in node 2.  The current code will mistakenly initialize the
+whole 1st region [mem 0xab00000000-0xfcffffffff], then do memmap defer to
+initialize only one memory section on the 2nd region [mem
+0x10000000000-0x1033fffffff].  In fact, we only expect to see that there's
+only one memory section's memmap initialized.  That's why more time is
+spent at that point.
+
+[    0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff]
+[    0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0xbfffffff]
+[    0.008843] ACPI: SRAT: Node 0 PXM 0 [mem 0x100000000-0x55ffffffff]
+[    0.008844] ACPI: SRAT: Node 1 PXM 1 [mem 0x5600000000-0xaaffffffff]
+[    0.008844] ACPI: SRAT: Node 2 PXM 2 [mem 0xab00000000-0xfcffffffff]
+[    0.008845] ACPI: SRAT: Node 2 PXM 2 [mem 0x10000000000-0x1033fffffff]
+
+Now, let's add a parameter 'zone_end_pfn' to memmap_init_zone() to pass
+down the real zone end pfn so that defer_init() can use it to judge
+whether defer need be taken in zone wide.
+
+Link: https://lkml.kernel.org/r/20201223080811.16211-1-bhe@redhat.com
+Link: https://lkml.kernel.org/r/20201223080811.16211-2-bhe@redhat.com
+Fixes: commit 73a6e474cb376 ("mm: memmap_init: iterate over memblock regions rather that check each PFN")
+Signed-off-by: Baoquan He <bhe@redhat.com>
+Reported-by: Rahul Gopakumar <gopakumarr@vmware.com>
+Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/ia64/mm/init.c |    4 ++--
+ include/linux/mm.h  |    5 +++--
+ mm/memory_hotplug.c |    2 +-
+ mm/page_alloc.c     |    8 +++++---
+ 4 files changed, 11 insertions(+), 8 deletions(-)
+
+--- a/arch/ia64/mm/init.c
++++ b/arch/ia64/mm/init.c
+@@ -536,7 +536,7 @@ virtual_memmap_init(u64 start, u64 end,
+ 
+       if (map_start < map_end)
+               memmap_init_zone((unsigned long)(map_end - map_start),
+-                               args->nid, args->zone, page_to_pfn(map_start),
++                               args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end),
+                                MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+       return 0;
+ }
+@@ -546,7 +546,7 @@ memmap_init (unsigned long size, int nid
+            unsigned long start_pfn)
+ {
+       if (!vmem_map) {
+-              memmap_init_zone(size, nid, zone, start_pfn,
++              memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size,
+                                MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+       } else {
+               struct page *start;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2439,8 +2439,9 @@ extern int __meminit __early_pfn_to_nid(
+ #endif
+ 
+ extern void set_dma_reserve(unsigned long new_dma_reserve);
+-extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
+-              enum meminit_context, struct vmem_altmap *, int migratetype);
++extern void memmap_init_zone(unsigned long, int, unsigned long,
++              unsigned long, unsigned long, enum meminit_context,
++              struct vmem_altmap *, int migratetype);
+ extern void setup_per_zone_wmarks(void);
+ extern int __meminit init_per_zone_wmark_min(void);
+ extern void mem_init(void);
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -714,7 +714,7 @@ void __ref move_pfn_range_to_zone(struct
+        * expects the zone spans the pfn range. All the pages in the range
+        * are reserved so nobody should be touching them so we should be safe
+        */
+-      memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
++      memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+                        MEMINIT_HOTPLUG, altmap, migratetype);
+ 
+       set_zone_contiguous(zone);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -448,6 +448,8 @@ defer_init(int nid, unsigned long pfn, u
+       if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+               return false;
+ 
++      if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
++              return true;
+       /*
+        * We start only with one section of pages, more pages are added as
+        * needed until the rest of deferred pages are initialized.
+@@ -6050,7 +6052,7 @@ overlap_memmap_init(unsigned long zone,
+  * zone stats (e.g., nr_isolate_pageblock) are touched.
+  */
+ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+-              unsigned long start_pfn,
++              unsigned long start_pfn, unsigned long zone_end_pfn,
+               enum meminit_context context,
+               struct vmem_altmap *altmap, int migratetype)
+ {
+@@ -6086,7 +6088,7 @@ void __meminit memmap_init_zone(unsigned
+               if (context == MEMINIT_EARLY) {
+                       if (overlap_memmap_init(zone, &pfn))
+                               continue;
+-                      if (defer_init(nid, pfn, end_pfn))
++                      if (defer_init(nid, pfn, zone_end_pfn))
+                               break;
+               }
+ 
+@@ -6200,7 +6202,7 @@ void __meminit __weak memmap_init(unsign
+ 
+               if (end_pfn > start_pfn) {
+                       size = end_pfn - start_pfn;
+-                      memmap_init_zone(size, nid, zone, start_pfn,
++                      memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
+                                        MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+               }
+       }
diff --git a/queue-5.10/opp-call-the-missing-clk_put-on-error.patch b/queue-5.10/opp-call-the-missing-clk_put-on-error.patch

new file mode 100644 (file)

index 0000000..8a36a4e
--- /dev/null
+++ b/queue-5.10/opp-call-the-missing-clk_put-on-error.patch
@@ -0,0 +1,42 @@
+From 0e1d9ca1766f5d95fb881f57b6c4a1ffa63d4648 Mon Sep 17 00:00:00 2001
+From: Viresh Kumar <viresh.kumar@linaro.org>
+Date: Mon, 28 Dec 2020 10:51:04 +0530
+Subject: opp: Call the missing clk_put() on error
+
+From: Viresh Kumar <viresh.kumar@linaro.org>
+
+commit 0e1d9ca1766f5d95fb881f57b6c4a1ffa63d4648 upstream.
+
+Fix the clock reference counting by calling the missing clk_put() in the
+error path.
+
+Cc: v5.10 <stable@vger.kernel.org> # v5.10
+Fixes: dd461cd9183f ("opp: Allow dev_pm_opp_get_opp_table() to return -EPROBE_DEFER")
+Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/opp/core.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/opp/core.c
++++ b/drivers/opp/core.c
+@@ -1111,7 +1111,7 @@ static struct opp_table *_allocate_opp_t
+       ret = dev_pm_opp_of_find_icc_paths(dev, opp_table);
+       if (ret) {
+               if (ret == -EPROBE_DEFER)
+-                      goto remove_opp_dev;
++                      goto put_clk;
+ 
+               dev_warn(dev, "%s: Error finding interconnect paths: %d\n",
+                        __func__, ret);
+@@ -1125,6 +1125,9 @@ static struct opp_table *_allocate_opp_t
+       list_add(&opp_table->node, &opp_tables);
+       return opp_table;
+ 
++put_clk:
++      if (!IS_ERR(opp_table->clk))
++              clk_put(opp_table->clk);
+ remove_opp_dev:
+       _remove_opp_dev(opp_dev, opp_table);
+ err:
diff --git a/queue-5.10/opp-fix-memory-leak-in-_allocate_opp_table.patch b/queue-5.10/opp-fix-memory-leak-in-_allocate_opp_table.patch

new file mode 100644 (file)

index 0000000..9a5dfd8
--- /dev/null
+++ b/queue-5.10/opp-fix-memory-leak-in-_allocate_opp_table.patch
@@ -0,0 +1,80 @@
+From 976509bb310b913d30577f15b58bdd30effb0542 Mon Sep 17 00:00:00 2001
+From: Quanyang Wang <quanyang.wang@windriver.com>
+Date: Thu, 24 Dec 2020 18:49:27 +0800
+Subject: opp: fix memory leak in _allocate_opp_table
+
+From: Quanyang Wang <quanyang.wang@windriver.com>
+
+commit 976509bb310b913d30577f15b58bdd30effb0542 upstream.
+
+In function _allocate_opp_table, opp_dev is allocated and referenced
+by opp_table via _add_opp_dev. But in the case that the subsequent calls
+return -EPROBE_DEFER, it will jump to err label and opp_table will be
+freed. Then opp_dev becomes an unreferenced object to cause memory leak.
+So let's call _remove_opp_dev to do the cleanup.
+
+This fixes the following kmemleak report:
+
+unreferenced object 0xffff000801524a00 (size 128):
+  comm "swapper/0", pid 1, jiffies 4294892465 (age 84.616s)
+  hex dump (first 32 bytes):
+    40 00 56 01 08 00 ff ff 40 00 56 01 08 00 ff ff  @.V.....@.V.....
+    b8 52 77 7f 08 00 ff ff 00 3c 4c 00 08 00 ff ff  .Rw......<L.....
+  backtrace:
+    [<00000000b1289fb1>] kmemleak_alloc+0x30/0x40
+    [<0000000056da48f0>] kmem_cache_alloc+0x3d4/0x588
+    [<00000000a84b3b0e>] _add_opp_dev+0x2c/0x88
+    [<0000000062a380cd>] _add_opp_table_indexed+0x124/0x268
+    [<000000008b4c8f1f>] dev_pm_opp_of_add_table+0x20/0x1d8
+    [<00000000e5316798>] dev_pm_opp_of_cpumask_add_table+0x48/0xf0
+    [<00000000db0a8ec2>] dt_cpufreq_probe+0x20c/0x448
+    [<0000000030a3a26c>] platform_probe+0x68/0xd8
+    [<00000000c618e78d>] really_probe+0xd0/0x3a0
+    [<00000000642e856f>] driver_probe_device+0x58/0xb8
+    [<00000000f10f5307>] device_driver_attach+0x74/0x80
+    [<0000000004f254b8>] __driver_attach+0x58/0xe0
+    [<0000000009d5d19e>] bus_for_each_dev+0x70/0xc8
+    [<0000000000d22e1c>] driver_attach+0x24/0x30
+    [<0000000001d4e952>] bus_add_driver+0x14c/0x1f0
+    [<0000000089928aaa>] driver_register+0x64/0x120
+
+Cc: v5.10 <stable@vger.kernel.org> # v5.10
+Fixes: dd461cd9183f ("opp: Allow dev_pm_opp_get_opp_table() to return -EPROBE_DEFER")
+Signed-off-by: Quanyang Wang <quanyang.wang@windriver.com>
+[ Viresh: Added the stable tag ]
+Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/opp/core.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/opp/core.c
++++ b/drivers/opp/core.c
+@@ -1102,7 +1102,7 @@ static struct opp_table *_allocate_opp_t
+       if (IS_ERR(opp_table->clk)) {
+               ret = PTR_ERR(opp_table->clk);
+               if (ret == -EPROBE_DEFER)
+-                      goto err;
++                      goto remove_opp_dev;
+ 
+               dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret);
+       }
+@@ -1111,7 +1111,7 @@ static struct opp_table *_allocate_opp_t
+       ret = dev_pm_opp_of_find_icc_paths(dev, opp_table);
+       if (ret) {
+               if (ret == -EPROBE_DEFER)
+-                      goto err;
++                      goto remove_opp_dev;
+ 
+               dev_warn(dev, "%s: Error finding interconnect paths: %d\n",
+                        __func__, ret);
+@@ -1125,6 +1125,8 @@ static struct opp_table *_allocate_opp_t
+       list_add(&opp_table->node, &opp_tables);
+       return opp_table;
+ 
++remove_opp_dev:
++      _remove_opp_dev(opp_dev, opp_table);
+ err:
+       kfree(opp_table);
+       return ERR_PTR(ret);
diff --git a/queue-5.10/scsi-block-fix-a-race-in-the-runtime-power-management-code.patch b/queue-5.10/scsi-block-fix-a-race-in-the-runtime-power-management-code.patch

new file mode 100644 (file)

index 0000000..45bd2d8
--- /dev/null
+++ b/queue-5.10/scsi-block-fix-a-race-in-the-runtime-power-management-code.patch
@@ -0,0 +1,80 @@
+From fa4d0f1992a96f6d7c988ef423e3127e613f6ac9 Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Tue, 8 Dec 2020 21:29:44 -0800
+Subject: scsi: block: Fix a race in the runtime power management code
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit fa4d0f1992a96f6d7c988ef423e3127e613f6ac9 upstream.
+
+With the current implementation the following race can happen:
+
+ * blk_pre_runtime_suspend() calls blk_freeze_queue_start() and
+   blk_mq_unfreeze_queue().
+
+ * blk_queue_enter() calls blk_queue_pm_only() and that function returns
+   true.
+
+ * blk_queue_enter() calls blk_pm_request_resume() and that function does
+   not call pm_request_resume() because the queue runtime status is
+   RPM_ACTIVE.
+
+ * blk_pre_runtime_suspend() changes the queue status into RPM_SUSPENDING.
+
+Fix this race by changing the queue runtime status into RPM_SUSPENDING
+before switching q_usage_counter to atomic mode.
+
+Link: https://lore.kernel.org/r/20201209052951.16136-2-bvanassche@acm.org
+Fixes: 986d413b7c15 ("blk-mq: Enable support for runtime power management")
+Cc: Ming Lei <ming.lei@redhat.com>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: stable <stable@vger.kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Acked-by: Alan Stern <stern@rowland.harvard.edu>
+Acked-by: Stanley Chu <stanley.chu@mediatek.com>
+Co-developed-by: Can Guo <cang@codeaurora.org>
+Signed-off-by: Can Guo <cang@codeaurora.org>
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-pm.c |   15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/block/blk-pm.c
++++ b/block/blk-pm.c
+@@ -67,6 +67,10 @@ int blk_pre_runtime_suspend(struct reque
+ 
+       WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE);
+ 
++      spin_lock_irq(&q->queue_lock);
++      q->rpm_status = RPM_SUSPENDING;
++      spin_unlock_irq(&q->queue_lock);
++
+       /*
+        * Increase the pm_only counter before checking whether any
+        * non-PM blk_queue_enter() calls are in progress to avoid that any
+@@ -89,15 +93,14 @@ int blk_pre_runtime_suspend(struct reque
+       /* Switch q_usage_counter back to per-cpu mode. */
+       blk_mq_unfreeze_queue(q);
+ 
+-      spin_lock_irq(&q->queue_lock);
+-      if (ret < 0)
++      if (ret < 0) {
++              spin_lock_irq(&q->queue_lock);
++              q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+-      else
+-              q->rpm_status = RPM_SUSPENDING;
+-      spin_unlock_irq(&q->queue_lock);
++              spin_unlock_irq(&q->queue_lock);
+ 
+-      if (ret)
+               blk_clear_pm_only(q);
++      }
+ 
+       return ret;
+ }
diff --git a/queue-5.10/series b/queue-5.10/series

index b8f5948e6d9e36b0f284517dce3ce0104961b7ba..10bf26375589cd40fe78583a352ea9c00ce91ed2 100644 (file)
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -7,3 +7,14 @@ drm-amd-display-add-get_dig_frontend-implementation-for-dcex.patch
  io_uring-close-a-small-race-gap-for-files-cancel.patch
  jffs2-allow-setting-rp_size-to-zero-during-remountin.patch
  jffs2-fix-null-pointer-dereference-in-rp_size-fs-opt.patch
+spi-dw-bt1-fix-undefined-devm_mux_control_get-symbol.patch
+opp-fix-memory-leak-in-_allocate_opp_table.patch
+opp-call-the-missing-clk_put-on-error.patch
+scsi-block-fix-a-race-in-the-runtime-power-management-code.patch
+mm-hugetlb-fix-deadlock-in-hugetlb_cow-error-path.patch
+mm-memmap-defer-init-doesn-t-work-as-expected.patch
+lib-zlib-fix-inflating-zlib-streams-on-s390.patch
+io_uring-don-t-assume-mm-is-constant-across-submits.patch
+io_uring-add-a-helper-for-setting-a-ref-node.patch
+io_uring-fix-io_sqe_files_unregister-hangs.patch
+kernel-io_uring-cancel-io_uring-before-task-works.patch
diff --git a/queue-5.10/spi-dw-bt1-fix-undefined-devm_mux_control_get-symbol.patch b/queue-5.10/spi-dw-bt1-fix-undefined-devm_mux_control_get-symbol.patch

new file mode 100644 (file)

index 0000000..6a4b41b
--- /dev/null
+++ b/queue-5.10/spi-dw-bt1-fix-undefined-devm_mux_control_get-symbol.patch
@@ -0,0 +1,57 @@
+From 7218838109fef61cdec988ff728e902d434c9cc5 Mon Sep 17 00:00:00 2001
+From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
+Date: Fri, 27 Nov 2020 17:46:11 +0300
+Subject: spi: dw-bt1: Fix undefined devm_mux_control_get symbol
+
+From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
+
+commit 7218838109fef61cdec988ff728e902d434c9cc5 upstream.
+
+I mistakenly added the select attributes to the SPI_DW_BT1_DIRMAP config
+instead of having them defined in SPI_DW_BT1. If the kernel doesn't have
+the MULTIPLEXER and MUX_MMIO configs manually enabled and the
+SPI_DW_BT1_DIRMAP config hasn't been selected, Baikal-T1 SPI device will
+always fail to be probed by the driver. Fix that and the error reported by
+the test robot:
+
+>> ld.lld: error: undefined symbol: devm_mux_control_get
+   >>> referenced by spi-dw-bt1.c
+   >>> spi/spi-dw-bt1.o:(dw_spi_bt1_sys_init) in archive drivers/built-in.a
+
+by moving the MULTIPLEXER/MUX_MMIO configs selection to the SPI_DW_BT1
+config.
+
+Link: https://lore.kernel.org/lkml/202011161745.uYRlekse-lkp@intel.com/
+Link: https://lore.kernel.org/linux-spi/20201116040721.8001-1-rdunlap@infradead.org/
+Fixes: abf00907538e ("spi: dw: Add Baikal-T1 SPI Controller glue driver")
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
+Cc: Randy Dunlap <rdunlap@infradead.org>
+Cc: Ramil Zaripov <Ramil.Zaripov@baikalelectronics.ru>
+Link: https://lore.kernel.org/r/20201127144612.4204-1-Sergey.Semin@baikalelectronics.ru
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/spi/Kconfig |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/spi/Kconfig
++++ b/drivers/spi/Kconfig
+@@ -256,6 +256,7 @@ config SPI_DW_BT1
+       tristate "Baikal-T1 SPI driver for DW SPI core"
+       depends on MIPS_BAIKAL_T1 || COMPILE_TEST
+       select MULTIPLEXER
++      select MUX_MMIO
+       help
+         Baikal-T1 SoC is equipped with three DW APB SSI-based MMIO SPI
+         controllers. Two of them are pretty much normal: with IRQ, DMA,
+@@ -269,8 +270,6 @@ config SPI_DW_BT1
+ config SPI_DW_BT1_DIRMAP
+       bool "Directly mapped Baikal-T1 Boot SPI flash support"
+       depends on SPI_DW_BT1
+-      select MULTIPLEXER
+-      select MUX_MMIO
+       help
+         Directly mapped SPI flash memory is an interface specific to the
+         Baikal-T1 System Boot Controller. It is a 16MB MMIO region, which
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 4 Jan 2021 10:24:04 +0000 (11:24 +0100)
queue-5.10/io_uring-add-a-helper-for-setting-a-ref-node.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/io_uring-don-t-assume-mm-is-constant-across-submits.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/io_uring-fix-io_sqe_files_unregister-hangs.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/kernel-io_uring-cancel-io_uring-before-task-works.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/lib-zlib-fix-inflating-zlib-streams-on-s390.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/mm-hugetlb-fix-deadlock-in-hugetlb_cow-error-path.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/mm-memmap-defer-init-doesn-t-work-as-expected.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/opp-call-the-missing-clk_put-on-error.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/opp-fix-memory-leak-in-_allocate_opp_table.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/scsi-block-fix-a-race-in-the-runtime-power-management-code.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/series		patch \| blob \| blame \| history
queue-5.10/spi-dw-bt1-fix-undefined-devm_mux_control_get-symbol.patch	[new file with mode: 0644]	patch \| blob