git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.3-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 14 Feb 2016 01:57:25 +0000 (17:57 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 14 Feb 2016 01:57:25 +0000 (17:57 -0800)
added patches:
fix-sysvfs-symlinks.patch
md-raid10-fix-data-corruption-and-crash-during-resync.patch
mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch

queue-4.3/fix-sysvfs-symlinks.patch [new file with mode: 0644]
queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch [new file with mode: 0644]
queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch [new file with mode: 0644]
queue-4.3/series

diff --git a/queue-4.3/fix-sysvfs-symlinks.patch b/queue-4.3/fix-sysvfs-symlinks.patch
new file mode 100644 (file)
index 0000000..998238b
--- /dev/null
@@ -0,0 +1,46 @@
+From 0ebf7f10d67a70e120f365018f1c5fce9ddc567d Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Mon, 23 Nov 2015 21:11:08 -0500
+Subject: fix sysvfs symlinks
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 0ebf7f10d67a70e120f365018f1c5fce9ddc567d upstream.
+
+The thing got broken back in 2002 - sysvfs does *not* have inline
+symlinks; even short ones have their bodies stored in the first block
+of the file.  sysv_symlink() handles that correctly; unfortunately,
+attempting to look an existing symlink up will end up mistaking it for
+an inline symlink and interpreting the block number containing the
+body as the body itself.
+
+Nobody has noticed until now, which says something about the level
+of testing sysvfs gets ;-/
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/sysv/inode.c |   11 ++---------
+ 1 file changed, 2 insertions(+), 9 deletions(-)
+
+--- a/fs/sysv/inode.c
++++ b/fs/sysv/inode.c
+@@ -162,15 +162,8 @@ void sysv_set_inode(struct inode *inode,
+               inode->i_fop = &sysv_dir_operations;
+               inode->i_mapping->a_ops = &sysv_aops;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (inode->i_blocks) {
+-                      inode->i_op = &sysv_symlink_inode_operations;
+-                      inode->i_mapping->a_ops = &sysv_aops;
+-              } else {
+-                      inode->i_op = &simple_symlink_inode_operations;
+-                      inode->i_link = (char *)SYSV_I(inode)->i_data;
+-                      nd_terminate_link(inode->i_link, inode->i_size,
+-                              sizeof(SYSV_I(inode)->i_data) - 1);
+-              }
++              inode->i_op = &sysv_symlink_inode_operations;
++              inode->i_mapping->a_ops = &sysv_aops;
+       } else
+               init_special_inode(inode, inode->i_mode, rdev);
+ }
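
To see why misreading the block pointer as the body is destructive, here is a
minimal userspace sketch (not kernel code; struct demo_inode, disk[], and both
follow_link helpers are invented for illustration) of the difference between
the correct lookup path and the 2002 regression described above:

    #include <stdio.h>
    #include <string.h>

    #define BLOCK_SIZE 512

    /* Hypothetical analogue of the on-disk inode: i_data[] holds block
     * numbers, never inline text. */
    struct demo_inode {
        unsigned int i_data[10];
        unsigned long i_blocks;
    };

    static char disk[16][BLOCK_SIZE];  /* stand-in for the block device */

    /* Correct: i_data[0] is a block number; the body lives in that block. */
    static const char *follow_link_correct(const struct demo_inode *ino)
    {
        return disk[ino->i_data[0]];
    }

    /* The regression: treat the block-pointer array itself as the body. */
    static const char *follow_link_broken(const struct demo_inode *ino)
    {
        return (const char *)ino->i_data;
    }

    int main(void)
    {
        struct demo_inode ino = { .i_data = { 3 }, .i_blocks = 1 };

        strcpy(disk[3], "/actual/target");
        printf("correct: %s\n", follow_link_correct(&ino));
        /* The broken reader sees the raw bytes of the number 3: garbage. */
        printf("broken : %.4s\n", follow_link_broken(&ino));
        return 0;
    }
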
diff --git a/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch b/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
new file mode 100644 (file)
index 0000000..c71c976
--- /dev/null
@@ -0,0 +1,80 @@
+From cc57858831e3e9678291de730c4b4d2e52a19f59 Mon Sep 17 00:00:00 2001
+From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
+Date: Fri, 18 Dec 2015 15:19:16 +1100
+Subject: md/raid10: fix data corruption and crash during resync
+
+From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
+
+commit cc57858831e3e9678291de730c4b4d2e52a19f59 upstream.
+
+The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
+bio_copy_data()") replaced manual data copying with bio_copy_data() but
+it doesn't work as intended. The source bio (fbio) is already processed,
+so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt.  Because of
+this, bio_copy_data() either does not copy anything, or worse, copies
+data from the ->bi_next bio if it is set.  This causes wrong data to be
+written to drives during resync and sometimes lockups/crashes in
+bio_copy_data():
+
+[  517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
+[  517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
+[  517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
+[  517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
+[  517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
+[  517.468529] RIP: 0010:[<ffffffff812e1888>]  [<ffffffff812e1888>] bio_copy_data+0xc8/0x3c0
+[  517.478164] RSP: 0018:ffff880150dfbc98  EFLAGS: 00000246
+[  517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
+[  517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
+[  517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
+[  517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
+[  517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
+[  517.525412] FS:  0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
+[  517.534844] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
+[  517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[  517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[  517.566144] Stack:
+[  517.568626]  ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
+[  517.577659]  0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
+[  517.586715]  ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
+[  517.595773] Call Trace:
+[  517.598747]  [<ffffffffa043ef95>] raid10d+0xfc5/0x1690 [raid10]
+[  517.605610]  [<ffffffff816697ae>] ? __schedule+0x29e/0x8e2
+[  517.611987]  [<ffffffff814ff206>] md_thread+0x106/0x140
+[  517.618072]  [<ffffffff810c1d80>] ? wait_woken+0x80/0x80
+[  517.624252]  [<ffffffff814ff100>] ? super_1_load+0x520/0x520
+[  517.630817]  [<ffffffff8109ef89>] kthread+0xc9/0xe0
+[  517.636506]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
+[  517.643653]  [<ffffffff8166d99f>] ret_from_fork+0x3f/0x70
+[  517.649929]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
+
+Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
+Reviewed-by: Shaohua Li <shli@kernel.org>
+Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
+Signed-off-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid10.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1944,6 +1944,8 @@ static void sync_request_write(struct md
+       first = i;
+       fbio = r10_bio->devs[i].bio;
++      fbio->bi_iter.bi_size = r10_bio->sectors << 9;
++      fbio->bi_iter.bi_idx = 0;
+       vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+       /* now find blocks with errors */
+@@ -1987,7 +1989,7 @@ static void sync_request_write(struct md
+               bio_reset(tbio);
+               tbio->bi_vcnt = vcnt;
+-              tbio->bi_iter.bi_size = r10_bio->sectors << 9;
++              tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
+               tbio->bi_rw = WRITE;
+               tbio->bi_private = r10_bio;
+               tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
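
The fix above rewinds fbio's iterator before reusing it as a copy source.
The pattern generalizes: a consumed forward iterator yields nothing on a
second pass unless its size and index are reset first. A userspace sketch of
that idea (struct seg, struct iter, and copy_data are illustrative stand-ins,
not the kernel's struct bvec_iter or bio_copy_data()):

    #include <stdio.h>
    #include <string.h>

    struct seg { const char *buf; size_t len; };

    struct iter {
        size_t size;  /* bytes remaining */
        int idx;      /* current segment */
    };

    /* Like bio_copy_data(), walk from the iterator's current position;
     * a fully consumed iterator (size == 0) yields zero bytes. */
    static size_t copy_data(const struct seg *segs, struct iter *it, char *dst)
    {
        size_t copied = 0;

        while (it->size) {
            memcpy(dst + copied, segs[it->idx].buf, segs[it->idx].len);
            copied += segs[it->idx].len;
            it->size -= segs[it->idx].len;
            it->idx++;
        }
        return copied;
    }

    int main(void)
    {
        struct seg segs[2] = { { "hello ", 6 }, { "world", 5 } };
        struct iter it = { .size = 0, .idx = 2 };  /* already consumed */
        char dst[32] = "";

        printf("no rewind: %zu bytes\n", copy_data(segs, &it, dst));

        /* The fix: rewind size and idx before reusing the source. */
        it.size = 11;
        it.idx = 0;
        printf("rewound  : %zu bytes (%s)\n", copy_data(segs, &it, dst), dst);
        return 0;
    }
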
diff --git a/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch b/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
new file mode 100644 (file)
index 0000000..6814876
--- /dev/null
@@ -0,0 +1,156 @@
+From 6df38689e0e9a07ff4f42c06b302e203b33667e9 Mon Sep 17 00:00:00 2001
+From: Vladimir Davydov <vdavydov@virtuozzo.com>
+Date: Tue, 29 Dec 2015 14:54:10 -0800
+Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim
+
+From: Vladimir Davydov <vdavydov@virtuozzo.com>
+
+commit 6df38689e0e9a07ff4f42c06b302e203b33667e9 upstream.
+
+Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
+once enough pages have been reclaimed, in which case, in contrast to a
+full round-trip over a cgroup sub-tree, the current position stored in
+mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
+and so is left holding the reference to the last scanned cgroup.  If the
+target cgroup does not get scanned again (we might have just reclaimed
+the last page or all processes might exit and free their memory
+voluntarily), we will leak it, because there is nobody to put the
+reference held by the iterator.
+
+The problem is easy to reproduce by running the following command
+sequence in a loop:
+
+    mkdir /sys/fs/cgroup/memory/test
+    echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
+    echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
+    memhog 150M
+    echo $$ > /sys/fs/cgroup/memory/cgroup.procs
+    rmdir test
+
+The cgroups generated by it will never get freed.
+
+This patch fixes the issue by making mem_cgroup_iter avoid taking a
+reference to the current position.  In order not to hit a use-after-free
+bug while running reclaim in parallel with cgroup deletion, we make use
+of the ->css_released cgroup callback to clear references to the dying
+cgroup in all reclaim iterators that might refer to it.  This callback
+is called right before scheduling the rcu work that will free the css,
+so if we access iter->position from an rcu read section, we can be sure
+it won't go away under us.
+
+[hannes@cmpxchg.org: clean up css ref handling]
+Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
+Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@kernel.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |   60 ++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 46 insertions(+), 14 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -902,14 +902,20 @@ struct mem_cgroup *mem_cgroup_iter(struc
+               if (prev && reclaim->generation != iter->generation)
+                       goto out_unlock;
+-              do {
++              while (1) {
+                       pos = READ_ONCE(iter->position);
++                      if (!pos || css_tryget(&pos->css))
++                              break;
+                       /*
+-                       * A racing update may change the position and
+-                       * put the last reference, hence css_tryget(),
+-                       * or retry to see the updated position.
++                       * css reference reached zero, so iter->position will
++                       * be cleared by ->css_released. However, we should not
++                       * rely on this happening soon, because ->css_released
++                       * is called from a work queue, and by busy-waiting we
++                       * might block it. So we clear iter->position right
++                       * away.
+                        */
+-              } while (pos && !css_tryget(&pos->css));
++                      (void)cmpxchg(&iter->position, pos, NULL);
++              }
+       }
+       if (pos)
+@@ -955,17 +961,13 @@ struct mem_cgroup *mem_cgroup_iter(struc
+       }
+       if (reclaim) {
+-              if (cmpxchg(&iter->position, pos, memcg) == pos) {
+-                      if (memcg)
+-                              css_get(&memcg->css);
+-                      if (pos)
+-                              css_put(&pos->css);
+-              }
+-
+               /*
+-               * pairs with css_tryget when dereferencing iter->position
+-               * above.
++               * The position could have already been updated by a competing
++               * thread, so check that the value hasn't changed since we read
++               * it to avoid reclaiming from the same cgroup twice.
+                */
++              (void)cmpxchg(&iter->position, pos, memcg);
++
+               if (pos)
+                       css_put(&pos->css);
+@@ -998,6 +1000,28 @@ void mem_cgroup_iter_break(struct mem_cg
+               css_put(&prev->css);
+ }
++static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
++{
++      struct mem_cgroup *memcg = dead_memcg;
++      struct mem_cgroup_reclaim_iter *iter;
++      struct mem_cgroup_per_zone *mz;
++      int nid, zid;
++      int i;
++
++      while ((memcg = parent_mem_cgroup(memcg))) {
++              for_each_node(nid) {
++                      for (zid = 0; zid < MAX_NR_ZONES; zid++) {
++                              mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
++                              for (i = 0; i <= DEF_PRIORITY; i++) {
++                                      iter = &mz->iter[i];
++                                      cmpxchg(&iter->position,
++                                              dead_memcg, NULL);
++                              }
++                      }
++              }
++      }
++}
++
+ /*
+  * Iteration constructs for visiting all cgroups (under a tree).  If
+  * loops are exited prematurely (break), mem_cgroup_iter_break() must
+@@ -4360,6 +4384,13 @@ static void mem_cgroup_css_offline(struc
+       wb_memcg_offline(memcg);
+ }
++static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
++{
++      struct mem_cgroup *memcg = mem_cgroup_from_css(css);
++
++      invalidate_reclaim_iterators(memcg);
++}
++
+ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+@@ -5216,6 +5247,7 @@ struct cgroup_subsys memory_cgrp_subsys
+       .css_alloc = mem_cgroup_css_alloc,
+       .css_online = mem_cgroup_css_online,
+       .css_offline = mem_cgroup_css_offline,
++      .css_released = mem_cgroup_css_released,
+       .css_free = mem_cgroup_css_free,
+       .css_reset = mem_cgroup_css_reset,
+       .can_attach = mem_cgroup_can_attach,
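
The combination used here - an unreferenced cached pointer, css_tryget() on
the reader side, and cmpxchg()-based scrubbing on release - is a reusable
weak-cache pattern. Below is a compact userspace model of it (obj, cached_pos,
tryget, iter_next, and release are hypothetical names, and the model omits
the RCU grace period the real code also relies on):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj {
        atomic_int refcount;
    };

    static _Atomic(struct obj *) cached_pos;  /* weak: no reference held */

    /* Succeed only while the count is still positive, like css_tryget(). */
    static bool tryget(struct obj *o)
    {
        int r = atomic_load(&o->refcount);

        while (r > 0)
            if (atomic_compare_exchange_weak(&o->refcount, &r, r + 1))
                return true;
        return false;
    }

    static struct obj *iter_next(void)
    {
        for (;;) {
            struct obj *pos = atomic_load(&cached_pos);

            if (!pos || tryget(pos))
                return pos;  /* valid, now pinned */
            /* Refcount hit zero: clear the stale cache ourselves rather
             * than busy-wait for the release path to do it. */
            atomic_compare_exchange_strong(&cached_pos, &pos, NULL);
        }
    }

    static void release(struct obj *o)
    {
        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
            /* Last reference gone: scrub any cache that still names us,
             * mirroring invalidate_reclaim_iterators(). */
            struct obj *expected = o;

            atomic_compare_exchange_strong(&cached_pos, &expected, NULL);
            /* the object would be freed here */
        }
    }

    int main(void)
    {
        static struct obj o = { .refcount = 1 };

        atomic_store(&cached_pos, &o);

        struct obj *p = iter_next();
        printf("pinned pos: %p\n", (void *)p);
        release(p);   /* drop the iterator's pin */
        release(&o);  /* drop the last ref: cache gets scrubbed */
        printf("cache after release: %p\n", (void *)atomic_load(&cached_pos));
        return 0;
    }
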
diff --git a/queue-4.3/series b/queue-4.3/series
index 48bbfc39506c92c8039805f5bbbd1732c56f9bf3..99fa0d31b20d7900768cc92dd959d89b2f16dfca 100644 (file)
--- a/queue-4.3/series
@@ -92,3 +92,6 @@ media-vb2-dma-contig-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-vb2-dma-sg-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-v4l2-ctrls-fix-setting-autocluster-to-manual-with-vidioc_s_ctrl.patch
 revert-ivtv-avoid-going-past-input-audio-array.patch
+mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
+md-raid10-fix-data-corruption-and-crash-during-resync.patch
+fix-sysvfs-symlinks.patch