git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Feb 2019 10:12:15 +0000 (11:12 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Feb 2019 10:12:15 +0000 (11:12 +0100)
added patches:
fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch

queue-4.9/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch b/queue-4.9/fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch
new file mode 100644 (file)
index 0000000..c9b45c7
--- /dev/null
@@ -0,0 +1,101 @@
+From 79f546a696bff2590169fb5684e23d65f4d9f591 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 11 May 2018 11:20:57 +1000
+Subject: fs: don't scan the inode cache before SB_BORN is set
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 79f546a696bff2590169fb5684e23d65f4d9f591 upstream.
+
+We recently had an oops reported on a 4.14 kernel in
+xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
+and so the m_perag_tree lookup walked into lala land.  It produces
+an oops down this path during the failed mount:
+
+  radix_tree_gang_lookup_tag+0xc4/0x130
+  xfs_perag_get_tag+0x37/0xf0
+  xfs_reclaim_inodes_count+0x32/0x40
+  xfs_fs_nr_cached_objects+0x11/0x20
+  super_cache_count+0x35/0xc0
+  shrink_slab.part.66+0xb1/0x370
+  shrink_node+0x7e/0x1a0
+  try_to_free_pages+0x199/0x470
+  __alloc_pages_slowpath+0x3a1/0xd20
+  __alloc_pages_nodemask+0x1c3/0x200
+  cache_grow_begin+0x20b/0x2e0
+  fallback_alloc+0x160/0x200
+  kmem_cache_alloc+0x111/0x4e0
+
+The problem is that the superblock shrinker is running before the
+filesystem structures it depends on have been fully set up. i.e.
+the shrinker is registered in sget(), before ->fill_super() has been
+called, and the shrinker can call into the filesystem before
+fill_super() does its setup work. Essentially we are exposed to
+both use-after-free and use-before-initialisation bugs here.
+
+To fix this, add a check for the SB_BORN flag in super_cache_count.
+In general, this flag is not set until ->fs_mount() completes
+successfully, so we know that it is set after the filesystem
+setup has completed. This matches the trylock_super() behaviour
+which will not let super_cache_scan() run if SB_BORN is not set, and
+hence will prevent the superblock shrinker from entering the
+filesystem while it is being set up or after it has failed setup
+and is being torn down.
+
+Cc: stable@kernel.org
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Aaron Lu <aaron.lu@linux.alibaba.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/super.c |   30 ++++++++++++++++++++++++------
+ 1 file changed, 24 insertions(+), 6 deletions(-)
+
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -119,13 +119,23 @@ static unsigned long super_cache_count(s
+       sb = container_of(shrink, struct super_block, s_shrink);
+       /*
+-       * Don't call trylock_super as it is a potential
+-       * scalability bottleneck. The counts could get updated
+-       * between super_cache_count and super_cache_scan anyway.
+-       * Call to super_cache_count with shrinker_rwsem held
+-       * ensures the safety of call to list_lru_shrink_count() and
+-       * s_op->nr_cached_objects().
++       * We don't call trylock_super() here as it is a scalability bottleneck,
++       * so we're exposed to partial setup state. The shrinker rwsem does not
++       * protect filesystem operations backing list_lru_shrink_count() or
++       * s_op->nr_cached_objects(). Counts can change between
++       * super_cache_count and super_cache_scan, so we really don't need locks
++       * here.
++       *
++       * However, if we are currently mounting the superblock, the underlying
++       * filesystem might be in a state of partial construction and hence it
++       * is dangerous to access it.  trylock_super() uses a MS_BORN check to
++       * avoid this situation, so do the same here. The memory barrier is
++       * matched with the one in mount_fs() as we don't hold locks here.
+        */
++      if (!(sb->s_flags & MS_BORN))
++              return 0;
++      smp_rmb();
++
+       if (sb->s_op && sb->s_op->nr_cached_objects)
+               total_objects = sb->s_op->nr_cached_objects(sb, sc);
+@@ -1193,6 +1203,14 @@ mount_fs(struct file_system_type *type,
+       sb = root->d_sb;
+       BUG_ON(!sb);
+       WARN_ON(!sb->s_bdi);
++
++      /*
++       * Write barrier is for super_cache_count(). We place it before setting
++       * MS_BORN as the data dependency between the two functions is the
++       * superblock structure contents that we just set up, not the MS_BORN
++       * flag.
++       */
++      smp_wmb();
+       sb->s_flags |= MS_BORN;
+       error = security_sb_kern_mount(sb, flags, secdata);
index 86c999d1cbd2b873c84953c258173cd7b41fded1..a137cf218e590dbb0615b83a3126bbafc96591f4 100644 (file)
@@ -26,3 +26,4 @@ mm-hwpoison-use-do_send_sig_info-instead-of-force_sig.patch
 mm-migrate-don-t-rely-on-__pagemovable-of-newpage-after-unlocking-it.patch
 cifs-always-resolve-hostname-before-reconnecting.patch
 drivers-core-remove-glue-dirs-from-sysfs-earlier.patch
+fs-don-t-scan-the-inode-cache-before-sb_born-is-set.patch