From fb7d6f5eda2530a2d8cfcd7f7db3459c08b85fe0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sat, 13 Feb 2016 17:57:25 -0800
Subject: [PATCH] 4.3-stable patches

added patches:
	fix-sysvfs-symlinks.patch
	md-raid10-fix-data-corruption-and-crash-during-resync.patch
	mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
---
 queue-4.3/fix-sysvfs-symlinks.patch           |  46 ++++++
 ...a-corruption-and-crash-during-resync.patch |  80 +++++++++
 ...emcg-leak-due-to-interrupted-reclaim.patch | 156 ++++++++++++++++++
 queue-4.3/series                              |   3 +
 4 files changed, 285 insertions(+)
 create mode 100644 queue-4.3/fix-sysvfs-symlinks.patch
 create mode 100644 queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
 create mode 100644 queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch

diff --git a/queue-4.3/fix-sysvfs-symlinks.patch b/queue-4.3/fix-sysvfs-symlinks.patch
new file mode 100644
index 00000000000..998238bf533
--- /dev/null
+++ b/queue-4.3/fix-sysvfs-symlinks.patch
@@ -0,0 +1,46 @@
+From 0ebf7f10d67a70e120f365018f1c5fce9ddc567d Mon Sep 17 00:00:00 2001
+From: Al Viro
+Date: Mon, 23 Nov 2015 21:11:08 -0500
+Subject: fix sysvfs symlinks
+
+From: Al Viro
+
+commit 0ebf7f10d67a70e120f365018f1c5fce9ddc567d upstream.
+
+The thing got broken back in 2002 - sysvfs does *not* have inline
+symlinks; even short ones have their bodies stored in the first
+block of the file.  sysv_symlink() handles that correctly;
+unfortunately, attempting to look an existing symlink up will end up
+mistaking them for inline symlinks and interpreting the block number
+containing the body as the body itself.
+
+Nobody has noticed until now, which says something about the level
+of testing sysvfs gets ;-/
+
+Signed-off-by: Al Viro
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/sysv/inode.c |   11 ++---------
+ 1 file changed, 2 insertions(+), 9 deletions(-)
+
+--- a/fs/sysv/inode.c
++++ b/fs/sysv/inode.c
+@@ -162,15 +162,8 @@ void sysv_set_inode(struct inode *inode,
+ 		inode->i_fop = &sysv_dir_operations;
+ 		inode->i_mapping->a_ops = &sysv_aops;
+ 	} else if (S_ISLNK(inode->i_mode)) {
+-		if (inode->i_blocks) {
+-			inode->i_op = &sysv_symlink_inode_operations;
+-			inode->i_mapping->a_ops = &sysv_aops;
+-		} else {
+-			inode->i_op = &simple_symlink_inode_operations;
+-			inode->i_link = (char *)SYSV_I(inode)->i_data;
+-			nd_terminate_link(inode->i_link, inode->i_size,
+-				sizeof(SYSV_I(inode)->i_data) - 1);
+-		}
++		inode->i_op = &sysv_symlink_inode_operations;
++		inode->i_mapping->a_ops = &sysv_aops;
+ 	} else
+ 		init_special_inode(inode, inode->i_mode, rdev);
+ }
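The failure mode above is easy to model outside the kernel. The sketch below is purely illustrative (toy structures and names, not the sysv on-disk format or any kernel API): i_data holds block numbers and the link body lives in the block they point to, so reading i_data itself as an inline body - what the removed branch effectively did during lookup - returns the raw block-number bytes instead of the target path.

/* Toy model of the sysvfs symlink lookup bug; all names are made up. */
#include <stdio.h>
#include <string.h>

#define TOY_BLOCK_SIZE 64

static char toy_disk[8][TOY_BLOCK_SIZE];	/* stand-in for the block device */

struct toy_inode {
	unsigned int i_data[3];			/* block pointers, not link text */
};

/* Correct: the link body lives in the file's first data block. */
static const char *toy_get_link_via_block(const struct toy_inode *inode)
{
	return toy_disk[inode->i_data[0]];
}

/* Buggy: treat the block-pointer array itself as an inline link body. */
static const char *toy_get_link_inline(const struct toy_inode *inode)
{
	return (const char *)inode->i_data;
}

int main(void)
{
	struct toy_inode ino = { .i_data = { 2, 0, 0 } };

	strcpy(toy_disk[2], "/actual/target");

	printf("via block: \"%s\"\n", toy_get_link_via_block(&ino));
	printf("inline:    \"%s\"\n", toy_get_link_inline(&ino));	/* garbage */
	return 0;
}

Compiled and run, the first line prints "/actual/target" while the second prints the unprintable bytes of the integer block number 2.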
diff --git a/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch b/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
new file mode 100644
index 00000000000..c71c976ef58
--- /dev/null
+++ b/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
@@ -0,0 +1,80 @@
+From cc57858831e3e9678291de730c4b4d2e52a19f59 Mon Sep 17 00:00:00 2001
+From: Artur Paszkiewicz
+Date: Fri, 18 Dec 2015 15:19:16 +1100
+Subject: md/raid10: fix data corruption and crash during resync
+
+From: Artur Paszkiewicz
+
+commit cc57858831e3e9678291de730c4b4d2e52a19f59 upstream.
+
+The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
+bio_copy_data()") replaced manual data copying with bio_copy_data() but
+it doesn't work as intended. The source bio (fbio) is already processed,
+so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt.  Because of
+this, bio_copy_data() either does not copy anything, or worse, copies
+data from the ->bi_next bio if it is set. This causes wrong data to be
+written to drives during resync and sometimes lockups/crashes in
+bio_copy_data():
+
+[  517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
+[  517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
+[  517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
+[  517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
+[  517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
+[  517.468529] RIP: 0010:[] [] bio_copy_data+0xc8/0x3c0
+[  517.478164] RSP: 0018:ffff880150dfbc98 EFLAGS: 00000246
+[  517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
+[  517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
+[  517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
+[  517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
+[  517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
+[  517.525412] FS: 0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
+[  517.534844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
+[  517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[  517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[  517.566144] Stack:
+[  517.568626]  ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
+[  517.577659]  0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
+[  517.586715]  ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
+[  517.595773] Call Trace:
+[  517.598747]  [] raid10d+0xfc5/0x1690 [raid10]
+[  517.605610]  [] ? __schedule+0x29e/0x8e2
+[  517.611987]  [] md_thread+0x106/0x140
+[  517.618072]  [] ? wait_woken+0x80/0x80
+[  517.624252]  [] ? super_1_load+0x520/0x520
+[  517.630817]  [] kthread+0xc9/0xe0
+[  517.636506]  [] ? flush_kthread_worker+0x70/0x70
+[  517.643653]  [] ret_from_fork+0x3f/0x70
+[  517.649929]  [] ? flush_kthread_worker+0x70/0x70
+
+Signed-off-by: Artur Paszkiewicz
+Reviewed-by: Shaohua Li
+Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
+Signed-off-by: NeilBrown
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid10.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1944,6 +1944,8 @@ static void sync_request_write(struct md
+ 
+ 	first = i;
+ 	fbio = r10_bio->devs[i].bio;
++	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
++	fbio->bi_iter.bi_idx = 0;
+ 
+ 	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+ 	/* now find blocks with errors */
+@@ -1987,7 +1989,7 @@ static void sync_request_write(struct md
+ 		bio_reset(tbio);
+ 
+ 		tbio->bi_vcnt = vcnt;
+-		tbio->bi_iter.bi_size = r10_bio->sectors << 9;
++		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
+ 		tbio->bi_rw = WRITE;
+ 		tbio->bi_private = r10_bio;
+ 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
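At its core the bug is reuse of an already-consumed iterator. Below is a minimal stand-alone sketch of that pattern (a toy iterator, not the kernel's bvec machinery): once size reaches zero the copy degenerates to a no-op, and rewinding the iterator first - the analogue of the two lines the patch adds for fbio->bi_iter - makes the second use work.

/* Toy model of the consumed-iterator bug; not the kernel bvec machinery. */
#include <stdio.h>
#include <string.h>

struct toy_iter {
	size_t size;	/* bytes left to process, like bi_iter.bi_size */
	size_t idx;	/* current position, like bi_iter.bi_idx */
};

/* Copy driven by the iterator; consumes it, as completing a bio would. */
static size_t toy_copy(char *dst, const char *src, struct toy_iter *it)
{
	size_t n = it->size;

	memcpy(dst, src + it->idx, n);
	it->idx += n;
	it->size = 0;
	return n;			/* bytes actually copied */
}

int main(void)
{
	char src[16] = "resync-data";
	char dst[16] = "";
	struct toy_iter it = { .size = sizeof(src), .idx = 0 };

	toy_copy(dst, src, &it);	/* first use consumes the iterator */

	/* Reuse without a reset: size == 0, so nothing is copied. */
	printf("copied %zu bytes\n", toy_copy(dst, src, &it));

	/* The fix: rewind the iterator before reusing the source. */
	it.size = sizeof(src);
	it.idx = 0;
	printf("copied %zu bytes\n", toy_copy(dst, src, &it));
	return 0;
}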
diff --git a/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch b/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
new file mode 100644
index 00000000000..6814876bcaf
--- /dev/null
+++ b/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
@@ -0,0 +1,156 @@
+From 6df38689e0e9a07ff4f42c06b302e203b33667e9 Mon Sep 17 00:00:00 2001
+From: Vladimir Davydov
+Date: Tue, 29 Dec 2015 14:54:10 -0800
+Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim
+
+From: Vladimir Davydov
+
+commit 6df38689e0e9a07ff4f42c06b302e203b33667e9 upstream.
+
+Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
+once enough pages have been reclaimed, in which case, in contrast to a
+full round-trip over a cgroup sub-tree, the current position stored in
+mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
+and so is left holding the reference to the last scanned cgroup. If the
+target cgroup does not get scanned again (we might have just reclaimed
+the last page or all processes might exit and free their memory
+voluntarily), we will leak it, because there is nobody to put the
+reference held by the iterator.
+
+The problem is easy to reproduce by running the following command
+sequence in a loop:
+
+  mkdir /sys/fs/cgroup/memory/test
+  echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
+  echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
+  memhog 150M
+  echo $$ > /sys/fs/cgroup/memory/cgroup.procs
+  rmdir test
+
+The cgroups generated by it will never get freed.
+
+This patch fixes the issue by making mem_cgroup_iter avoid taking a
+reference to the current position. In order not to hit a use-after-free
+bug while running reclaim in parallel with cgroup deletion, we make use
+of the ->css_released cgroup callback to clear references to the dying
+cgroup in all reclaim iterators that might refer to it. This callback
+is called right before scheduling the rcu work which will free the css,
+so if we access iter->position from an rcu read section, we can be sure
+it won't go away under us.
+
+[hannes@cmpxchg.org: clean up css ref handling]
+Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
+Signed-off-by: Vladimir Davydov
+Signed-off-by: Johannes Weiner
+Acked-by: Michal Hocko
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/memcontrol.c |   60 ++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 46 insertions(+), 14 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -902,14 +902,20 @@ struct mem_cgroup *mem_cgroup_iter(struc
+ 		if (prev && reclaim->generation != iter->generation)
+ 			goto out_unlock;
+ 
+-		do {
++		while (1) {
+ 			pos = READ_ONCE(iter->position);
++			if (!pos || css_tryget(&pos->css))
++				break;
+ 			/*
+-			 * A racing update may change the position and
+-			 * put the last reference, hence css_tryget(),
+-			 * or retry to see the updated position.
++			 * css reference reached zero, so iter->position will
++			 * be cleared by ->css_released. However, we should not
++			 * rely on this happening soon, because ->css_released
++			 * is called from a work queue, and by busy-waiting we
++			 * might block it. So we clear iter->position right
++			 * away.
+ 			 */
+-		} while (pos && !css_tryget(&pos->css));
++			(void)cmpxchg(&iter->position, pos, NULL);
++		}
+ 	}
+ 
+ 	if (pos)
+@@ -955,17 +961,13 @@ struct mem_cgroup *mem_cgroup_iter(struc
+ 	}
+ 
+ 	if (reclaim) {
+-		if (cmpxchg(&iter->position, pos, memcg) == pos) {
+-			if (memcg)
+-				css_get(&memcg->css);
+-			if (pos)
+-				css_put(&pos->css);
+-		}
+-
+ 		/*
+-		 * pairs with css_tryget when dereferencing iter->position
+-		 * above.
++		 * The position could have already been updated by a competing
++		 * thread, so check that the value hasn't changed since we read
++		 * it to avoid reclaiming from the same cgroup twice.
+ 		 */
++		(void)cmpxchg(&iter->position, pos, memcg);
++
+ 		if (pos)
+ 			css_put(&pos->css);
+ 
+@@ -998,6 +1000,28 @@ void mem_cgroup_iter_break(struct mem_cg
+ 		css_put(&prev->css);
+ }
+ 
++static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
++{
++	struct mem_cgroup *memcg = dead_memcg;
++	struct mem_cgroup_reclaim_iter *iter;
++	struct mem_cgroup_per_zone *mz;
++	int nid, zid;
++	int i;
++
++	while ((memcg = parent_mem_cgroup(memcg))) {
++		for_each_node(nid) {
++			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
++				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
++				for (i = 0; i <= DEF_PRIORITY; i++) {
++					iter = &mz->iter[i];
++					cmpxchg(&iter->position,
++						dead_memcg, NULL);
++				}
++			}
++		}
++	}
++}
++
+ /*
+  * Iteration constructs for visiting all cgroups (under a tree).  If
+  * loops are exited prematurely (break), mem_cgroup_iter_break() must
+@@ -4360,6 +4384,13 @@ static void mem_cgroup_css_offline(struc
+ 	wb_memcg_offline(memcg);
+ }
+ 
++static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
++{
++	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
++
++	invalidate_reclaim_iterators(memcg);
++}
++
+ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+@@ -5216,6 +5247,7 @@ struct cgroup_subsys memory_cgrp_subsys
+ 	.css_alloc = mem_cgroup_css_alloc,
+ 	.css_online = mem_cgroup_css_online,
+ 	.css_offline = mem_cgroup_css_offline,
++	.css_released = mem_cgroup_css_released,
+ 	.css_free = mem_cgroup_css_free,
+ 	.css_reset = mem_cgroup_css_reset,
+ 	.can_attach = mem_cgroup_can_attach,
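The invalidation scheme itself is small enough to model in isolation. In the sketch below, C11 atomics stand in for the kernel's cmpxchg() and a single global slot stands in for the per-zone iter->position array; everything here is illustrative, not the memcg implementation. The releaser clears the cached raw pointer only if it still refers to the dying object, so a reader can never pick up a stale position after the object is freed.

/* Sketch of the invalidation idea; C11 atomics stand in for cmpxchg(). */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	const char *name;
};

static _Atomic(struct obj *) iter_position;	/* like iter->position */

/* Called by the releaser (the analogue of ->css_released) before free. */
static void invalidate_iterator(struct obj *dead)
{
	struct obj *expected = dead;

	/* Clear the slot only if it still points at the dying object. */
	atomic_compare_exchange_strong(&iter_position, &expected, NULL);
}

int main(void)
{
	struct obj a = { "memcg-A" };

	atomic_store(&iter_position, &a);	/* reclaim cached this position */

	invalidate_iterator(&a);		/* object is about to be freed */

	struct obj *pos = atomic_load(&iter_position);
	printf("position after invalidation: %s\n",
	       pos ? pos->name : "(cleared)");
	return 0;
}

The compare-and-swap matters: an unconditional store could wipe out a newer position written by a concurrent reclaimer, which is why the patch uses cmpxchg(&iter->position, dead_memcg, NULL) rather than a plain assignment.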
diff --git a/queue-4.3/series b/queue-4.3/series
index 48bbfc39506..99fa0d31b20 100644
--- a/queue-4.3/series
+++ b/queue-4.3/series
@@ -92,3 +92,6 @@ media-vb2-dma-contig-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-vb2-dma-sg-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-v4l2-ctrls-fix-setting-autocluster-to-manual-with-vidioc_s_ctrl.patch
 revert-ivtv-avoid-going-past-input-audio-array.patch
+mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
+md-raid10-fix-data-corruption-and-crash-during-resync.patch
+fix-sysvfs-symlinks.patch
-- 
2.47.3