--- /dev/null
+From 73b1c4acaad39877bfe0ee9001df2d87f9f2581e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 22 Jul 2020 11:12:46 -0400
+Subject: btrfs: don't show full path of bind mounts in subvol=
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit 3ef3959b29c4a5bd65526ab310a1a18ae533172a ]
+
+Chris Murphy reported a problem where rpm ostree will bind mount a bunch
+of things for whatever voodoo it's doing. But when it does this
+/proc/mounts shows something like
+
+ /dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
+ /dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
+
+Despite subvolid=256 being subvol=/foo. This is because we're just
+spitting out the dentry of the mount point, which in the case of bind
+mounts is the source path for the mountpoint. Instead we should spit
+out the path to the actual subvol. Fix this by looking up the name for
+the subvolid we have mounted. With this fix the same test looks like
+this
+
+ /dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
+ /dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
+
+Reported-by: Chris Murphy <chris@colorremedies.com>
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/super.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
+index ca95e57b60ee1..eb64d4b159e07 100644
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -1221,6 +1221,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
+ {
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ char *compress_type;
++ const char *subvol_name;
+
+ if (btrfs_test_opt(info, DEGRADED))
+ seq_puts(seq, ",degraded");
+@@ -1307,8 +1308,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
+ #endif
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+- seq_puts(seq, ",subvol=");
+- seq_dentry(seq, dentry, " \t\n\\");
++ subvol_name = btrfs_get_subvol_name_from_objectid(info,
++ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
++ if (!IS_ERR(subvol_name)) {
++ seq_puts(seq, ",subvol=");
++ seq_escape(seq, subvol_name, " \t\n\\");
++ kfree(subvol_name);
++ }
+ return 0;
+ }
+
+--
+2.25.1
+
--- /dev/null
+From f84c46c31e2e89ddf8b0b7b6508a55d74fe34403 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 21 Feb 2020 14:56:12 +0100
+Subject: btrfs: export helpers for subvolume name/id resolution
+
+From: Marcos Paulo de Souza <mpdesouza@suse.com>
+
+[ Upstream commit c0c907a47dccf2cf26251a8fb4a8e7a3bf79ce84 ]
+
+The functions will be used outside of export.c and super.c to allow
+resolving subvolume name from a given id, eg. for subvolume deletion by
+id ioctl.
+
+Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ split from the next patch ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/ctree.h | 2 ++
+ fs/btrfs/export.c | 8 ++++----
+ fs/btrfs/export.h | 5 +++++
+ fs/btrfs/super.c | 8 ++++----
+ 4 files changed, 15 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 5412b12491cb8..de951987fd23d 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -3262,6 +3262,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
+ unsigned long new_flags);
+ int btrfs_sync_fs(struct super_block *sb, int wait);
++char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
++ u64 subvol_objectid);
+
+ static inline __printf(2, 3)
+ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
+index 3aeb5770f8965..b6ce765aa7f33 100644
+--- a/fs/btrfs/export.c
++++ b/fs/btrfs/export.c
+@@ -56,9 +56,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ return type;
+ }
+
+-static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+- u64 root_objectid, u32 generation,
+- int check_generation)
++struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
++ u64 root_objectid, u32 generation,
++ int check_generation)
+ {
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root;
+@@ -151,7 +151,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ }
+
+-static struct dentry *btrfs_get_parent(struct dentry *child)
++struct dentry *btrfs_get_parent(struct dentry *child)
+ {
+ struct inode *dir = d_inode(child);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
+index 91b3908e7c549..15db024621414 100644
+--- a/fs/btrfs/export.h
++++ b/fs/btrfs/export.h
+@@ -17,4 +17,9 @@ struct btrfs_fid {
+ u64 parent_root_objectid;
+ } __attribute__ ((packed));
+
++struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
++ u64 root_objectid, u32 generation,
++ int check_generation);
++struct dentry *btrfs_get_parent(struct dentry *child);
++
+ #endif
+diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
+index 17a8463ef35c1..ca95e57b60ee1 100644
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -939,8 +939,8 @@ out:
+ return error;
+ }
+
+-static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+- u64 subvol_objectid)
++char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
++ u64 subvol_objectid)
+ {
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_root *fs_root;
+@@ -1427,8 +1427,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ goto out;
+ }
+ }
+- subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+- subvol_objectid);
++ subvol_name = btrfs_get_subvol_name_from_objectid(
++ btrfs_sb(mnt->mnt_sb), subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+--
+2.25.1
+
--- /dev/null
+From 100f0653e012f14174d1f7ea130107ebe07dd25b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Jul 2020 16:39:26 +0800
+Subject: btrfs: inode: fix NULL pointer dereference if inode doesn't need
+ compression
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit 1e6e238c3002ea3611465ce5f32777ddd6a40126 ]
+
+[BUG]
+There is a bug report of NULL pointer dereference caused in
+compress_file_extent():
+
+ Oops: Kernel access of bad area, sig: 11 [#1]
+ LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+ Workqueue: btrfs-delalloc btrfs_delalloc_helper [btrfs]
+ NIP [c008000006dd4d34] compress_file_range.constprop.41+0x75c/0x8a0 [btrfs]
+ LR [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs]
+ Call Trace:
+ [c000000c69093b00] [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs] (unreliable)
+ [c000000c69093bd0] [c008000006dd4ebc] async_cow_start+0x44/0xa0 [btrfs]
+ [c000000c69093c10] [c008000006e14824] normal_work_helper+0xdc/0x598 [btrfs]
+ [c000000c69093c80] [c0000000001608c0] process_one_work+0x2c0/0x5b0
+ [c000000c69093d10] [c000000000160c38] worker_thread+0x88/0x660
+ [c000000c69093db0] [c00000000016b55c] kthread+0x1ac/0x1c0
+ [c000000c69093e20] [c00000000000b660] ret_from_kernel_thread+0x5c/0x7c
+ ---[ end trace f16954aa20d822f6 ]---
+
+[CAUSE]
+For the following execution route of compress_file_range(), it's
+possible to hit NULL pointer dereference:
+
+ compress_file_range()
+ |- pages = NULL;
+ |- start = async_chunk->start = 0;
+ |- end = async_chunk->end = 4095;
+ |- nr_pages = 1;
+ |- inode_need_compress() == false; <<< Possible, see later explanation
+ | Now, we have nr_pages = 1, pages = NULL
+ |- cont:
+ |- ret = cow_file_range_inline();
+ |- if (ret <= 0) {
+ |- for (i = 0; i < nr_pages; i++) {
+ |- WARN_ON(pages[i]->mapping); <<< Crash
+
+To enter above call execution branch, we need the following race:
+
+ Thread 1 (chattr) | Thread 2 (writeback)
+--------------------------+------------------------------
+ | btrfs_run_delalloc_range
+ | |- inode_need_compress = true
+ | |- cow_file_range_async()
+btrfs_ioctl_set_flag() |
+|- binode_flags |= |
+ BTRFS_INODE_NOCOMPRESS |
+ | compress_file_range()
+ | |- inode_need_compress = false
+ | |- nr_pages = 1 while pages = NULL
+ | | Then hit the crash
+
+[FIX]
+This patch will fix it by checking @pages before accessing it.
+This patch is only designed as a hot fix and easy to backport.
+
+More elegant fix may make btrfs only check inode_need_compress() once to
+avoid such race, but that would be another story.
+
+Reported-by: Luciano Chavez <chavez@us.ibm.com>
+Fixes: 4d3a800ebb12 ("btrfs: merge nr_pages input and output parameter in compress_pages")
+CC: stable@vger.kernel.org # 4.14.x: cecc8d9038d16: btrfs: Move free_pages_out label in inline extent handling branch in compress_file_range
+CC: stable@vger.kernel.org # 4.14+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 15 +++++++++++----
+ 1 file changed, 11 insertions(+), 4 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index dc520749f51db..17856e92b93d1 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -630,11 +630,18 @@ cont:
+ start,
+ end - start + 1);
+
+- for (i = 0; i < nr_pages; i++) {
+- WARN_ON(pages[i]->mapping);
+- put_page(pages[i]);
++ /*
++ * Ensure we only free the compressed pages if we have
++ * them allocated, as we can still reach here with
++ * inode_need_compress() == false.
++ */
++ if (pages) {
++ for (i = 0; i < nr_pages; i++) {
++ WARN_ON(pages[i]->mapping);
++ put_page(pages[i]);
++ }
++ kfree(pages);
+ }
+- kfree(pages);
+
+ return;
+ }
+--
+2.25.1
+
--- /dev/null
+From 3f9d136ee7b1741f5b715179f3a9cd44de4efd84 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jul 2019 14:41:45 +0300
+Subject: btrfs: Move free_pages_out label in inline extent handling branch in
+ compress_file_range
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+[ Upstream commit cecc8d9038d164eda61fbcd72520975a554ea63e ]
+
+This label is only executed if compress_file_range fails to create an
+inline extent. So move its code in the semantically related inline
+extent handling branch. No functional changes.
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 57908ee964a20..dc520749f51db 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -629,7 +629,14 @@ cont:
+ btrfs_free_reserved_data_space_noquota(inode,
+ start,
+ end - start + 1);
+- goto free_pages_out;
++
++ for (i = 0; i < nr_pages; i++) {
++ WARN_ON(pages[i]->mapping);
++ put_page(pages[i]);
++ }
++ kfree(pages);
++
++ return;
+ }
+ }
+
+@@ -708,13 +715,6 @@ cleanup_and_bail_uncompressed:
+ *num_added += 1;
+
+ return;
+-
+-free_pages_out:
+- for (i = 0; i < nr_pages; i++) {
+- WARN_ON(pages[i]->mapping);
+- put_page(pages[i]);
+- }
+- kfree(pages);
+ }
+
+ static void free_async_extent_pages(struct async_extent *async_extent)
+--
+2.25.1
+
--- /dev/null
+From 8e2073529525e69ef39ac62d70caa93777e132e2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 21 Jul 2020 10:17:50 -0400
+Subject: btrfs: sysfs: use NOFS for device creation
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+Dave hit this splat during testing btrfs/078:
+
+ ======================================================
+ WARNING: possible circular locking dependency detected
+ 5.8.0-rc6-default+ #1191 Not tainted
+ ------------------------------------------------------
+ kswapd0/75 is trying to acquire lock:
+ ffffa040e9d04ff8 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+
+ but task is already holding lock:
+ ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #2 (fs_reclaim){+.+.}-{0:0}:
+ __lock_acquire+0x56f/0xaa0
+ lock_acquire+0xa3/0x440
+ fs_reclaim_acquire.part.0+0x25/0x30
+ __kmalloc_track_caller+0x49/0x330
+ kstrdup+0x2e/0x60
+ __kernfs_new_node.constprop.0+0x44/0x250
+ kernfs_new_node+0x25/0x50
+ kernfs_create_link+0x34/0xa0
+ sysfs_do_create_link_sd+0x5e/0xd0
+ btrfs_sysfs_add_devices_dir+0x65/0x100 [btrfs]
+ btrfs_init_new_device+0x44c/0x12b0 [btrfs]
+ btrfs_ioctl+0xc3c/0x25c0 [btrfs]
+ ksys_ioctl+0x68/0xa0
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x50/0xe0
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}:
+ __lock_acquire+0x56f/0xaa0
+ lock_acquire+0xa3/0x440
+ __mutex_lock+0xa0/0xaf0
+ btrfs_chunk_alloc+0x137/0x3e0 [btrfs]
+ find_free_extent+0xb44/0xfb0 [btrfs]
+ btrfs_reserve_extent+0x9b/0x180 [btrfs]
+ btrfs_alloc_tree_block+0xc1/0x350 [btrfs]
+ alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs]
+ __btrfs_cow_block+0x143/0x7a0 [btrfs]
+ btrfs_cow_block+0x15f/0x310 [btrfs]
+ push_leaf_right+0x150/0x240 [btrfs]
+ split_leaf+0x3cd/0x6d0 [btrfs]
+ btrfs_search_slot+0xd14/0xf70 [btrfs]
+ btrfs_insert_empty_items+0x64/0xc0 [btrfs]
+ __btrfs_commit_inode_delayed_items+0xb2/0x840 [btrfs]
+ btrfs_async_run_delayed_root+0x10e/0x1d0 [btrfs]
+ btrfs_work_helper+0x2f9/0x650 [btrfs]
+ process_one_work+0x22c/0x600
+ worker_thread+0x50/0x3b0
+ kthread+0x137/0x150
+ ret_from_fork+0x1f/0x30
+
+ -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
+ check_prev_add+0x98/0xa20
+ validate_chain+0xa8c/0x2a00
+ __lock_acquire+0x56f/0xaa0
+ lock_acquire+0xa3/0x440
+ __mutex_lock+0xa0/0xaf0
+ __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ btrfs_evict_inode+0x3bf/0x560 [btrfs]
+ evict+0xd6/0x1c0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x54/0x80
+ super_cache_scan+0x121/0x1a0
+ do_shrink_slab+0x175/0x420
+ shrink_slab+0xb1/0x2e0
+ shrink_node+0x192/0x600
+ balance_pgdat+0x31f/0x750
+ kswapd+0x206/0x510
+ kthread+0x137/0x150
+ ret_from_fork+0x1f/0x30
+
+ other info that might help us debug this:
+
+ Chain exists of:
+ &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(fs_reclaim);
+ lock(&fs_info->chunk_mutex);
+ lock(fs_reclaim);
+ lock(&delayed_node->mutex);
+
+ *** DEADLOCK ***
+
+ 3 locks held by kswapd0/75:
+ #0: ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+ #1: ffffffff8b0b50b8 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0
+ #2: ffffa040e057c0e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50
+
+ stack backtrace:
+ CPU: 2 PID: 75 Comm: kswapd0 Not tainted 5.8.0-rc6-default+ #1191
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+ Call Trace:
+ dump_stack+0x78/0xa0
+ check_noncircular+0x16f/0x190
+ check_prev_add+0x98/0xa20
+ validate_chain+0xa8c/0x2a00
+ __lock_acquire+0x56f/0xaa0
+ lock_acquire+0xa3/0x440
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ __mutex_lock+0xa0/0xaf0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ ? __lock_acquire+0x56f/0xaa0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ ? lock_acquire+0xa3/0x440
+ ? btrfs_evict_inode+0x138/0x560 [btrfs]
+ ? btrfs_evict_inode+0x2fe/0x560 [btrfs]
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs]
+ btrfs_evict_inode+0x3bf/0x560 [btrfs]
+ evict+0xd6/0x1c0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x54/0x80
+ super_cache_scan+0x121/0x1a0
+ do_shrink_slab+0x175/0x420
+ shrink_slab+0xb1/0x2e0
+ shrink_node+0x192/0x600
+ balance_pgdat+0x31f/0x750
+ kswapd+0x206/0x510
+ ? _raw_spin_unlock_irqrestore+0x3e/0x50
+ ? finish_wait+0x90/0x90
+ ? balance_pgdat+0x750/0x750
+ kthread+0x137/0x150
+ ? kthread_stop+0x2a0/0x2a0
+ ret_from_fork+0x1f/0x30
+
+This is because we're holding the chunk_mutex while adding this device
+and adding its sysfs entries. We actually hold different locks in
+different places when calling this function, the dev_replace semaphore
+for instance in dev replace, so instead of moving this call around
+simply wrap its operations in NOFS.
+
+CC: stable@vger.kernel.org # 4.14+
+Reported-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/sysfs.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
+index f05341bda1d14..383546ff62f04 100644
+--- a/fs/btrfs/sysfs.c
++++ b/fs/btrfs/sysfs.c
+@@ -25,6 +25,7 @@
+ #include <linux/bug.h>
+ #include <linux/genhd.h>
+ #include <linux/debugfs.h>
++#include <linux/sched/mm.h>
+
+ #include "ctree.h"
+ #include "disk-io.h"
+@@ -749,7 +750,9 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
+ {
+ int error = 0;
+ struct btrfs_device *dev;
++ unsigned int nofs_flag;
+
++ nofs_flag = memalloc_nofs_save();
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+@@ -768,6 +771,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
+ if (error)
+ break;
+ }
++ memalloc_nofs_restore(nofs_flag);
+
+ return error;
+ }
+--
+2.25.1
+
--- /dev/null
+From a3b3f77521585f5548f773354c2284ff238e5a98 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 8 Jul 2020 16:49:11 +0100
+Subject: drm/vgem: Replace opencoded version of drm_gem_dumb_map_offset()
+
+From: Chris Wilson <chris@chris-wilson.co.uk>
+
+[ Upstream commit 119c53d2d4044c59c450c4f5a568d80b9d861856 ]
+
+drm_gem_dumb_map_offset() now exists and does everything
+vgem_gem_dumb_map does and *ought* to do.
+
+In particular, vgem_gem_dumb_map() was trying to reject mmapping an
+imported dmabuf by checking the existence of obj->filp. Unfortunately,
+we always allocated an obj->filp, even if unused for an imported dmabuf.
+Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
+("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
+obj->import_attach to reject such invalid mmaps.
+
+This prevents vgem from allowing userspace mmapping the dumb handle and
+attempting to incorrectly fault in remote pages belonging to another
+device, where there may not even be a struct page.
+
+v2: Use the default drm_gem_dumb_map_offset() callback
+
+Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
+Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Cc: <stable@vger.kernel.org> # v4.13+
+Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@chris-wilson.co.uk
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/vgem/vgem_drv.c | 27 ---------------------------
+ 1 file changed, 27 deletions(-)
+
+diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
+index aa592277d5108..67037eb9a80ee 100644
+--- a/drivers/gpu/drm/vgem/vgem_drv.c
++++ b/drivers/gpu/drm/vgem/vgem_drv.c
+@@ -220,32 +220,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
+ return 0;
+ }
+
+-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
+- uint32_t handle, uint64_t *offset)
+-{
+- struct drm_gem_object *obj;
+- int ret;
+-
+- obj = drm_gem_object_lookup(file, handle);
+- if (!obj)
+- return -ENOENT;
+-
+- if (!obj->filp) {
+- ret = -EINVAL;
+- goto unref;
+- }
+-
+- ret = drm_gem_create_mmap_offset(obj);
+- if (ret)
+- goto unref;
+-
+- *offset = drm_vma_node_offset_addr(&obj->vma_node);
+-unref:
+- drm_gem_object_put_unlocked(obj);
+-
+- return ret;
+-}
+-
+ static struct drm_ioctl_desc vgem_ioctls[] = {
+ DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+@@ -439,7 +413,6 @@ static struct drm_driver vgem_driver = {
+ .fops = &vgem_driver_fops,
+
+ .dumb_create = vgem_gem_dumb_create,
+- .dumb_map_offset = vgem_gem_dumb_map,
+
+ .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
+ .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
+--
+2.25.1
+
--- /dev/null
+From fa056309ff894d8a3e5575094b72f3265834f25b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Aug 2020 17:42:02 -0700
+Subject: khugepaged: adjust VM_BUG_ON_MM() in __khugepaged_enter()
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit f3f99d63a8156c7a4a6b20aac22b53c5579c7dc1 ]
+
+syzbot crashes on the VM_BUG_ON_MM(khugepaged_test_exit(mm), mm) in
+__khugepaged_enter(): yes, when one thread is about to dump core, has set
+core_state, and is waiting for others, another might do something calling
+__khugepaged_enter(), which now crashes because I lumped the core_state
+test (known as "mmget_still_valid") into khugepaged_test_exit(). I still
+think it's best to lump them together, so just in this exceptional case,
+check mm->mm_users directly instead of khugepaged_test_exit().
+
+Fixes: bbe98f9cadff ("khugepaged: khugepaged_test_exit() check mmget_still_valid()")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Acked-by: Yang Shi <shy828301@gmail.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Song Liu <songliubraving@fb.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: <stable@vger.kernel.org> [4.8+]
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008141503370.18085@eggly.anvils
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index a1b7475c05d04..9dfe364d4c0d1 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -407,7 +407,7 @@ int __khugepaged_enter(struct mm_struct *mm)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
++ VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+--
+2.25.1
+
--- /dev/null
+From 55df81fc13e7d52519d0b86d994fac5725102ade Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 6 Aug 2020 23:26:25 -0700
+Subject: khugepaged: khugepaged_test_exit() check mmget_still_valid()
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 ]
+
+Move collapse_huge_page()'s mmget_still_valid() check into
+khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
+only, and earned its mmget_still_valid() check because it inserts a huge
+pmd entry in place of the page table's pmd entry; whereas
+collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
+merely clears the page table's pmd entry. But core dumping without mmap
+lock must have been as open to mistaking a racily cleared pmd entry for a
+page table at physical page 0, as exit_mmap() was. And we certainly have
+no interest in mapping as a THP once dumping core.
+
+Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Song Liu <songliubraving@fb.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: <stable@vger.kernel.org> [4.8+]
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 04b4c38d0c184..a1b7475c05d04 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -394,7 +394,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
+
+ static inline int khugepaged_test_exit(struct mm_struct *mm)
+ {
+- return atomic_read(&mm->mm_users) == 0;
++ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ }
+
+ int __khugepaged_enter(struct mm_struct *mm)
+@@ -1006,9 +1006,6 @@ static void collapse_huge_page(struct mm_struct *mm,
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+- result = SCAN_ANY_PROCESS;
+- if (!mmget_still_valid(mm))
+- goto out;
+ result = hugepage_vma_revalidate(mm, address, &vma);
+ if (result)
+ goto out;
+--
+2.25.1
+
--- /dev/null
+From e3ae49bcd65f6ddea463635ed15ef04e050ef84c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Jul 2020 22:11:23 +0900
+Subject: perf probe: Fix memory leakage when the probe point is not found
+
+From: Masami Hiramatsu <mhiramat@kernel.org>
+
+[ Upstream commit 12d572e785b15bc764e956caaa8a4c846fd15694 ]
+
+Fix the memory leakage in debuginfo__find_trace_events() when the probe
+point is not found in the debuginfo. If there is no probe point found in
+the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
+
+Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
+release the allocated memory for the array of struct probe_trace_event.
+
+The current code releases the memory only if the debuginfo__find_probes()
+hits an error but not checks tf.ntevs. In the result, the memory allocated
+on *tevs are not released if tf.ntevs == 0.
+
+This fixes the memory leakage by checking tf.ntevs == 0 in addition to
+ret < 0.
+
+Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
+Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
+Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: stable@vger.kernel.org
+Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@devnote2
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/perf/util/probe-finder.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
+index 8f7f9d05f38c0..bfa6d9d215569 100644
+--- a/tools/perf/util/probe-finder.c
++++ b/tools/perf/util/probe-finder.c
+@@ -1354,7 +1354,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
+ tf.ntevs = 0;
+
+ ret = debuginfo__find_probes(dbg, &tf.pf);
+- if (ret < 0) {
++ if (ret < 0 || tf.ntevs == 0) {
+ for (i = 0; i < tf.ntevs; i++)
+ clear_probe_trace_event(&tf.tevs[i]);
+ zfree(tevs);
+--
+2.25.1
+
--- /dev/null
+From 37b26bb493c39a3d0798104ef907b24e3cc8c521 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 24 Jul 2020 19:25:25 +1000
+Subject: powerpc: Allow 4224 bytes of stack expansion for the signal frame
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+[ Upstream commit 63dee5df43a31f3844efabc58972f0a206ca4534 ]
+
+We have powerpc specific logic in our page fault handling to decide if
+an access to an unmapped address below the stack pointer should expand
+the stack VMA.
+
+The code was originally added in 2004 "ported from 2.4". The rough
+logic is that the stack is allowed to grow to 1MB with no extra
+checking. Over 1MB the access must be within 2048 bytes of the stack
+pointer, or be from a user instruction that updates the stack pointer.
+
+The 2048 byte allowance below the stack pointer is there to cover the
+288 byte "red zone" as well as the "about 1.5kB" needed by the signal
+delivery code.
+
+Unfortunately since then the signal frame has expanded, and is now
+4224 bytes on 64-bit kernels with transactional memory enabled. This
+means if a process has consumed more than 1MB of stack, and its stack
+pointer lies less than 4224 bytes from the next page boundary, signal
+delivery will fault when trying to expand the stack and the process
+will see a SEGV.
+
+The total size of the signal frame is the size of struct rt_sigframe
+(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
+64-bit).
+
+The 2048 byte allowance was correct until 2008 as the signal frame
+was:
+
+struct rt_sigframe {
+ struct ucontext uc; /* 0 1440 */
+ /* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
+ long unsigned int _unused[2]; /* 1440 16 */
+ unsigned int tramp[6]; /* 1456 24 */
+ struct siginfo * pinfo; /* 1480 8 */
+ void * puc; /* 1488 8 */
+ struct siginfo info; /* 1496 128 */
+ /* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
+ char abigap[288]; /* 1624 288 */
+
+ /* size: 1920, cachelines: 15, members: 7 */
+ /* padding: 8 */
+};
+
+1920 + 128 = 2048
+
+Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
+ptrace and signal support") (Jul 2008) the signal frame expanded to
+2304 bytes:
+
+struct rt_sigframe {
+ struct ucontext uc; /* 0 1696 */ <--
+ /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
+ long unsigned int _unused[2]; /* 1696 16 */
+ unsigned int tramp[6]; /* 1712 24 */
+ struct siginfo * pinfo; /* 1736 8 */
+ void * puc; /* 1744 8 */
+ struct siginfo info; /* 1752 128 */
+ /* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
+ char abigap[288]; /* 1880 288 */
+
+ /* size: 2176, cachelines: 17, members: 7 */
+ /* padding: 8 */
+};
+
+2176 + 128 = 2304
+
+At this point we should have been exposed to the bug, though as far as
+I know it was never reported. I no longer have a system old enough to
+easily test on.
+
+Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
+grow-down stack segment") caused our stack expansion code to never
+trigger, as there was always a VMA found for a write up to PAGE_SIZE
+below r1.
+
+That meant the bug was hidden as we continued to expand the signal
+frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
+state to the signal context") (Feb 2013):
+
+struct rt_sigframe {
+ struct ucontext uc; /* 0 1696 */
+ /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
+ struct ucontext uc_transact; /* 1696 1696 */ <--
+ /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
+ long unsigned int _unused[2]; /* 3392 16 */
+ unsigned int tramp[6]; /* 3408 24 */
+ struct siginfo * pinfo; /* 3432 8 */
+ void * puc; /* 3440 8 */
+ struct siginfo info; /* 3448 128 */
+ /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
+ char abigap[288]; /* 3576 288 */
+
+ /* size: 3872, cachelines: 31, members: 8 */
+ /* padding: 8 */
+ /* last cacheline: 32 bytes */
+};
+
+3872 + 128 = 4000
+
+And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
+userspace to 512 bytes") (Feb 2014):
+
+struct rt_sigframe {
+ struct ucontext uc; /* 0 1696 */
+ /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
+ struct ucontext uc_transact; /* 1696 1696 */
+ /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
+ long unsigned int _unused[2]; /* 3392 16 */
+ unsigned int tramp[6]; /* 3408 24 */
+ struct siginfo * pinfo; /* 3432 8 */
+ void * puc; /* 3440 8 */
+ struct siginfo info; /* 3448 128 */
+ /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
+ char abigap[512]; /* 3576 512 */ <--
+
+ /* size: 4096, cachelines: 32, members: 8 */
+ /* padding: 8 */
+};
+
+4096 + 128 = 4224
+
+Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
+gap, between vmas") exposed us to the existing bug, because it changed
+the stack VMA to be the correct/real size, meaning our stack expansion
+code is now triggered.
+
+Fix it by increasing the allowance to 4224 bytes.
+
+Hard-coding 4224 is obviously unsafe against future expansions of the
+signal frame in the same way as the existing code. We can't easily use
+sizeof() because the signal frame structure is not in a header. We
+will either fix that, or rip out all the custom stack expansion
+checking logic entirely.
+
+Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
+Cc: stable@vger.kernel.org # v2.6.27+
+Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
+Tested-by: Daniel Axtens <dja@axtens.net>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20200724092528.1578671-2-mpe@ellerman.id.au
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/powerpc/mm/fault.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
+index 998c77e600a43..ebe97e5500ee5 100644
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -224,6 +224,9 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
+ return is_exec || (address >= TASK_SIZE);
+ }
+
++// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
++#define SIGFRAME_MAX_SIZE (4096 + 128)
++
+ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma, unsigned int flags,
+ bool *must_retry)
+@@ -231,7 +234,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+ * 288 bytes below the stack pointer.
+- * The kernel signal delivery code writes up to about 1.5kB
++ * The kernel signal delivery code writes a bit over 4KB
+ * below the stack pointer (r1) before decrementing it.
+ * The exec code can write slightly over 640kB to the stack
+ * before setting the user r1. Thus we allow the stack to
+@@ -256,7 +259,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+- if (address + 2048 >= uregs->gpr[1])
++ if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
+ return false;
+
+ if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
+--
+2.25.1
+
--- /dev/null
+From 678ba6516fac09d9fbb6f23d38b5659df1554622 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 23 May 2018 10:53:22 +0200
+Subject: powerpc/mm: Only read faulting instruction when necessary in
+ do_page_fault()
+
+From: Christophe Leroy <christophe.leroy@c-s.fr>
+
+[ Upstream commit 0e36b0d12501e278686634712975b785bae11641 ]
+
+Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
+userspace instruction miss") has shown that limiting the read of
+faulting instruction to likely cases improves performance.
+
+This patch goes further into this direction by limiting the read
+of the faulting instruction to the only cases where it is likely
+needed.
+
+On an MPC885, with the same benchmark app as in the commit referred
+above, we see a reduction of about 3900 dTLB misses (approx 3%):
+
+Before the patch:
+ Performance counter stats for './fault 500' (10 runs):
+
+ 683033312 cpu-cycles ( +- 0.03% )
+ 134538 dTLB-load-misses ( +- 0.03% )
+ 46099 iTLB-load-misses ( +- 0.02% )
+ 19681 faults ( +- 0.02% )
+
+ 5.389747878 seconds time elapsed ( +- 0.06% )
+
+With the patch:
+
+ Performance counter stats for './fault 500' (10 runs):
+
+ 682112862 cpu-cycles ( +- 0.03% )
+ 130619 dTLB-load-misses ( +- 0.03% )
+ 46073 iTLB-load-misses ( +- 0.05% )
+ 19681 faults ( +- 0.01% )
+
+ 5.381342641 seconds time elapsed ( +- 0.07% )
+
+The proper work of the huge stack expansion was tested with the
+following app:
+
+int main(int argc, char **argv)
+{
+ char buf[1024 * 1025];
+
+ sprintf(buf, "Hello world !\n");
+ printf(buf);
+
+ exit(0);
+}
+
+Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
+Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
+[mpe: Add include of pagemap.h to fix build errors]
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/powerpc/mm/fault.c | 50 ++++++++++++++++++++++++++++-------------
+ 1 file changed, 34 insertions(+), 16 deletions(-)
+
+diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
+index 5fc8a010fdf07..998c77e600a43 100644
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -22,6 +22,7 @@
+ #include <linux/errno.h>
+ #include <linux/string.h>
+ #include <linux/types.h>
++#include <linux/pagemap.h>
+ #include <linux/ptrace.h>
+ #include <linux/mman.h>
+ #include <linux/mm.h>
+@@ -66,15 +67,11 @@ static inline bool notify_page_fault(struct pt_regs *regs)
+ }
+
+ /*
+- * Check whether the instruction at regs->nip is a store using
++ * Check whether the instruction inst is a store using
+ * an update addressing form which will update r1.
+ */
+-static bool store_updates_sp(struct pt_regs *regs)
++static bool store_updates_sp(unsigned int inst)
+ {
+- unsigned int inst;
+-
+- if (get_user(inst, (unsigned int __user *)regs->nip))
+- return false;
+ /* check for 1 in the rA field */
+ if (((inst >> 16) & 0x1f) != 1)
+ return false;
+@@ -228,8 +225,8 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
+ }
+
+ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+- struct vm_area_struct *vma,
+- bool store_update_sp)
++ struct vm_area_struct *vma, unsigned int flags,
++ bool *must_retry)
+ {
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+@@ -241,6 +238,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ * expand to 1MB without further checks.
+ */
+ if (address + 0x100000 < vma->vm_end) {
++ unsigned int __user *nip = (unsigned int __user *)regs->nip;
+ /* get user regs even if this fault is in kernel mode */
+ struct pt_regs *uregs = current->thread.regs;
+ if (uregs == NULL)
+@@ -258,8 +256,22 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+- if (address + 2048 < uregs->gpr[1] && !store_update_sp)
+- return true;
++ if (address + 2048 >= uregs->gpr[1])
++ return false;
++
++ if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
++ access_ok(VERIFY_READ, nip, sizeof(*nip))) {
++ unsigned int inst;
++ int res;
++
++ pagefault_disable();
++ res = __get_user_inatomic(inst, nip);
++ pagefault_enable();
++ if (!res)
++ return !store_updates_sp(inst);
++ *must_retry = true;
++ }
++ return true;
+ }
+ return false;
+ }
+@@ -392,7 +404,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+ int is_user = user_mode(regs);
+ int is_write = page_fault_is_write(error_code);
+ int fault, major = 0;
+- bool store_update_sp = false;
++ bool must_retry = false;
+
+ if (notify_page_fault(regs))
+ return 0;
+@@ -439,9 +451,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+ * can result in fault, which will cause a deadlock when called with
+ * mmap_sem held
+ */
+- if (is_write && is_user)
+- store_update_sp = store_updates_sp(regs);
+-
+ if (is_user)
+ flags |= FAULT_FLAG_USER;
+ if (is_write)
+@@ -488,8 +497,17 @@ retry:
+ return bad_area(regs, address);
+
+ /* The stack is being expanded, check if it's valid */
+- if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
+- return bad_area(regs, address);
++ if (unlikely(bad_stack_expansion(regs, address, vma, flags,
++ &must_retry))) {
++ if (!must_retry)
++ return bad_area(regs, address);
++
++ up_read(&mm->mmap_sem);
++ if (fault_in_pages_readable((const char __user *)regs->nip,
++ sizeof(unsigned int)))
++ return bad_area_nosemaphore(regs, address);
++ goto retry;
++ }
+
+ /* Try to expand it */
+ if (unlikely(expand_stack(vma, address)))
+--
+2.25.1
+
--- /dev/null
+drm-vgem-replace-opencoded-version-of-drm_gem_dumb_m.patch
+perf-probe-fix-memory-leakage-when-the-probe-point-i.patch
+khugepaged-khugepaged_test_exit-check-mmget_still_va.patch
+khugepaged-adjust-vm_bug_on_mm-in-__khugepaged_enter.patch
+powerpc-mm-only-read-faulting-instruction-when-neces.patch
+powerpc-allow-4224-bytes-of-stack-expansion-for-the-.patch
+btrfs-export-helpers-for-subvolume-name-id-resolutio.patch
+btrfs-don-t-show-full-path-of-bind-mounts-in-subvol.patch
+btrfs-move-free_pages_out-label-in-inline-extent-han.patch
+btrfs-inode-fix-null-pointer-dereference-if-inode-do.patch
+btrfs-sysfs-use-nofs-for-device-creation.patch