From fb7d6f5eda2530a2d8cfcd7f7db3459c08b85fe0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sat, 13 Feb 2016 17:57:25 -0800
Subject: [PATCH] 4.3-stable patches

added patches:
	fix-sysvfs-symlinks.patch
	md-raid10-fix-data-corruption-and-crash-during-resync.patch
	mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
---
 queue-4.3/fix-sysvfs-symlinks.patch           |  46 ++++++
 ...a-corruption-and-crash-during-resync.patch |  80 +++++++++
 ...emcg-leak-due-to-interrupted-reclaim.patch | 156 ++++++++++++++++++
 queue-4.3/series                              |   3 +
 4 files changed, 285 insertions(+)
 create mode 100644 queue-4.3/fix-sysvfs-symlinks.patch
 create mode 100644 queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
 create mode 100644 queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch

diff --git a/queue-4.3/fix-sysvfs-symlinks.patch b/queue-4.3/fix-sysvfs-symlinks.patch
new file mode 100644
index 00000000000..998238bf533
--- /dev/null
+++ b/queue-4.3/fix-sysvfs-symlinks.patch
@@ -0,0 +1,46 @@
+From 0ebf7f10d67a70e120f365018f1c5fce9ddc567d Mon Sep 17 00:00:00 2001
+From: Al Viro
+Date: Mon, 23 Nov 2015 21:11:08 -0500
+Subject: fix sysvfs symlinks
+
+From: Al Viro
+
+commit 0ebf7f10d67a70e120f365018f1c5fce9ddc567d upstream.
+
+The thing got broken back in 2002 - sysvfs does *not* have inline
+symlinks; even short ones have their bodies stored in the first
+block of the file.  sysv_symlink() handles that correctly;
+unfortunately, attempting to look an existing symlink up will end up
+mistaking them for inline symlinks and interpreting the block number
+containing the body as the body itself.
+
+Nobody has noticed until now, which says something about the level
+of testing sysvfs gets ;-/
+
+Signed-off-by: Al Viro
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/sysv/inode.c |   11 ++---------
+ 1 file changed, 2 insertions(+), 9 deletions(-)
+
+--- a/fs/sysv/inode.c
++++ b/fs/sysv/inode.c
+@@ -162,15 +162,8 @@ void sysv_set_inode(struct inode *inode,
+ 		inode->i_fop = &sysv_dir_operations;
+ 		inode->i_mapping->a_ops = &sysv_aops;
+ 	} else if (S_ISLNK(inode->i_mode)) {
+-		if (inode->i_blocks) {
+-			inode->i_op = &sysv_symlink_inode_operations;
+-			inode->i_mapping->a_ops = &sysv_aops;
+-		} else {
+-			inode->i_op = &simple_symlink_inode_operations;
+-			inode->i_link = (char *)SYSV_I(inode)->i_data;
+-			nd_terminate_link(inode->i_link, inode->i_size,
+-				sizeof(SYSV_I(inode)->i_data) - 1);
+-		}
++		inode->i_op = &sysv_symlink_inode_operations;
++		inode->i_mapping->a_ops = &sysv_aops;
+ 	} else
+ 		init_special_inode(inode, inode->i_mode, rdev);
+ }
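The failure mode above is easy to model outside the kernel. The sketch below is purely illustrative (toy structures and names, not the sysv on-disk format or any kernel API): i_data holds block numbers and the link body lives in the block they point to, so reading i_data itself as an inline body - what the removed branch effectively did during lookup - returns the raw block-number bytes instead of the target path.

/* Toy model of the sysvfs symlink lookup bug; all names are made up. */
#include <stdio.h>
#include <string.h>

#define TOY_BLOCK_SIZE 64

static char toy_disk[8][TOY_BLOCK_SIZE];	/* stand-in for the block device */

struct toy_inode {
	unsigned int i_data[3];			/* block pointers, not link text */
};

/* Correct: the link body lives in the file's first data block. */
static const char *toy_get_link_via_block(const struct toy_inode *inode)
{
	return toy_disk[inode->i_data[0]];
}

/* Buggy: treat the block-pointer array itself as an inline link body. */
static const char *toy_get_link_inline(const struct toy_inode *inode)
{
	return (const char *)inode->i_data;
}

int main(void)
{
	struct toy_inode ino = { .i_data = { 2, 0, 0 } };

	strcpy(toy_disk[2], "/actual/target");

	printf("via block: \"%s\"\n", toy_get_link_via_block(&ino));
	printf("inline:    \"%s\"\n", toy_get_link_inline(&ino));	/* garbage */
	return 0;
}

Compiled and run, the first line prints "/actual/target" while the second prints the unprintable bytes of the integer block number 2.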
diff --git a/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch b/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
new file mode 100644
index 00000000000..c71c976ef58
--- /dev/null
+++ b/queue-4.3/md-raid10-fix-data-corruption-and-crash-during-resync.patch
@@ -0,0 +1,80 @@
+From cc57858831e3e9678291de730c4b4d2e52a19f59 Mon Sep 17 00:00:00 2001
+From: Artur Paszkiewicz
+Date: Fri, 18 Dec 2015 15:19:16 +1100
+Subject: md/raid10: fix data corruption and crash during resync
+
+From: Artur Paszkiewicz
+
+commit cc57858831e3e9678291de730c4b4d2e52a19f59 upstream.
+
+The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
+bio_copy_data()") replaced manual data copying with bio_copy_data() but
+it doesn't work as intended. The source bio (fbio) is already processed,
+so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt.  Because of
+this, bio_copy_data() either does not copy anything, or worse, copies
+data from the ->bi_next bio if it is set. This causes wrong data to be
+written to drives during resync and sometimes lockups/crashes in
+bio_copy_data():
+
+[  517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
+[  517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
+[  517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
+[  517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
+[  517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
+[  517.468529] RIP: 0010:[] [] bio_copy_data+0xc8/0x3c0
+[  517.478164] RSP: 0018:ffff880150dfbc98 EFLAGS: 00000246
+[  517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
+[  517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
+[  517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
+[  517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
+[  517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
+[  517.525412] FS: 0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
+[  517.534844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
+[  517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[  517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[  517.566144] Stack:
+[  517.568626]  ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
+[  517.577659]  0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
+[  517.586715]  ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
+[  517.595773] Call Trace:
+[  517.598747]  [] raid10d+0xfc5/0x1690 [raid10]
+[  517.605610]  [] ? __schedule+0x29e/0x8e2
+[  517.611987]  [] md_thread+0x106/0x140
+[  517.618072]  [] ? wait_woken+0x80/0x80
+[  517.624252]  [] ? super_1_load+0x520/0x520
+[  517.630817]  [] kthread+0xc9/0xe0
+[  517.636506]  [] ? flush_kthread_worker+0x70/0x70
+[  517.643653]  [] ret_from_fork+0x3f/0x70
+[  517.649929]  [] ? flush_kthread_worker+0x70/0x70
+
+Signed-off-by: Artur Paszkiewicz
+Reviewed-by: Shaohua Li
+Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
+Signed-off-by: NeilBrown
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid10.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1944,6 +1944,8 @@ static void sync_request_write(struct md
+ 
+ 	first = i;
+ 	fbio = r10_bio->devs[i].bio;
++	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
++	fbio->bi_iter.bi_idx = 0;
+ 
+ 	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+ 	/* now find blocks with errors */
+@@ -1987,7 +1989,7 @@ static void sync_request_write(struct md
+ 		bio_reset(tbio);
+ 
+ 		tbio->bi_vcnt = vcnt;
+-		tbio->bi_iter.bi_size = r10_bio->sectors << 9;
++		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
+ 		tbio->bi_rw = WRITE;
+ 		tbio->bi_private = r10_bio;
+ 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
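At its core the bug is reuse of an already-consumed iterator. Below is a minimal stand-alone sketch of that pattern (a toy iterator, not the kernel's bvec machinery): once size reaches zero the copy degenerates to a no-op, and rewinding the iterator first - the analogue of the two lines the patch adds for fbio->bi_iter - makes the second use work.

/* Toy model of the consumed-iterator bug; not the kernel bvec machinery. */
#include <stdio.h>
#include <string.h>

struct toy_iter {
	size_t size;	/* bytes left to process, like bi_iter.bi_size */
	size_t idx;	/* current position, like bi_iter.bi_idx */
};

/* Copy driven by the iterator; consumes it, as completing a bio would. */
static size_t toy_copy(char *dst, const char *src, struct toy_iter *it)
{
	size_t n = it->size;

	memcpy(dst, src + it->idx, n);
	it->idx += n;
	it->size = 0;
	return n;			/* bytes actually copied */
}

int main(void)
{
	char src[16] = "resync-data";
	char dst[16] = "";
	struct toy_iter it = { .size = sizeof(src), .idx = 0 };

	toy_copy(dst, src, &it);	/* first use consumes the iterator */

	/* Reuse without a reset: size == 0, so nothing is copied. */
	printf("copied %zu bytes\n", toy_copy(dst, src, &it));

	/* The fix: rewind the iterator before reusing the source. */
	it.size = sizeof(src);
	it.idx = 0;
	printf("copied %zu bytes\n", toy_copy(dst, src, &it));
	return 0;
}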
diff --git a/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch b/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
new file mode 100644
index 00000000000..6814876bcaf
--- /dev/null
+++ b/queue-4.3/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
@@ -0,0 +1,156 @@
+From 6df38689e0e9a07ff4f42c06b302e203b33667e9 Mon Sep 17 00:00:00 2001
+From: Vladimir Davydov
+Date: Tue, 29 Dec 2015 14:54:10 -0800
+Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim
+
+From: Vladimir Davydov
+
+commit 6df38689e0e9a07ff4f42c06b302e203b33667e9 upstream.
+
+Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
+once enough pages have been reclaimed, in which case, in contrast to a
+full round-trip over a cgroup sub-tree, the current position stored in
+mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
+and so is left holding the reference to the last scanned cgroup. If the
+target cgroup does not get scanned again (we might have just reclaimed
+the last page or all processes might exit and free their memory
+voluntarily), we will leak it, because there is nobody to put the
+reference held by the iterator.
+
+The problem is easy to reproduce by running the following command
+sequence in a loop:
+
+  mkdir /sys/fs/cgroup/memory/test
+  echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
+  echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
+  memhog 150M
+  echo $$ > /sys/fs/cgroup/memory/cgroup.procs
+  rmdir test
+
+The cgroups generated by it will never get freed.
+
+This patch fixes the issue by making mem_cgroup_iter avoid taking a
+reference to the current position. In order not to hit a use-after-free
+bug while running reclaim in parallel with cgroup deletion, we make use
+of the ->css_released cgroup callback to clear references to the dying
+cgroup in all reclaim iterators that might refer to it. This callback
+is called right before scheduling the rcu work which will free the css,
+so if we access iter->position from an rcu read section, we can be sure
+it won't go away under us.
+
+[hannes@cmpxchg.org: clean up css ref handling]
+Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
+Signed-off-by: Vladimir Davydov
+Signed-off-by: Johannes Weiner
+Acked-by: Michal Hocko
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/memcontrol.c |   60 ++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 46 insertions(+), 14 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -902,14 +902,20 @@ struct mem_cgroup *mem_cgroup_iter(struc
+ 		if (prev && reclaim->generation != iter->generation)
+ 			goto out_unlock;
+ 
+-		do {
++		while (1) {
+ 			pos = READ_ONCE(iter->position);
++			if (!pos || css_tryget(&pos->css))
++				break;
+ 			/*
+-			 * A racing update may change the position and
+-			 * put the last reference, hence css_tryget(),
+-			 * or retry to see the updated position.
++			 * css reference reached zero, so iter->position will
++			 * be cleared by ->css_released. However, we should not
++			 * rely on this happening soon, because ->css_released
++			 * is called from a work queue, and by busy-waiting we
++			 * might block it. So we clear iter->position right
++			 * away.
+ 			 */
+-		} while (pos && !css_tryget(&pos->css));
++			(void)cmpxchg(&iter->position, pos, NULL);
++		}
+ 	}
+ 
+ 	if (pos)
+@@ -955,17 +961,13 @@ struct mem_cgroup *mem_cgroup_iter(struc
+ 	}
+ 
+ 	if (reclaim) {
+-		if (cmpxchg(&iter->position, pos, memcg) == pos) {
+-			if (memcg)
+-				css_get(&memcg->css);
+-			if (pos)
+-				css_put(&pos->css);
+-		}
+-
+ 		/*
+-		 * pairs with css_tryget when dereferencing iter->position
+-		 * above.
++		 * The position could have already been updated by a competing
++		 * thread, so check that the value hasn't changed since we read
++		 * it to avoid reclaiming from the same cgroup twice.
+ 		 */
++		(void)cmpxchg(&iter->position, pos, memcg);
++
+ 		if (pos)
+ 			css_put(&pos->css);
+ 
+@@ -998,6 +1000,28 @@ void mem_cgroup_iter_break(struct mem_cg
+ 		css_put(&prev->css);
+ }
+ 
++static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
++{
++	struct mem_cgroup *memcg = dead_memcg;
++	struct mem_cgroup_reclaim_iter *iter;
++	struct mem_cgroup_per_zone *mz;
++	int nid, zid;
++	int i;
++
++	while ((memcg = parent_mem_cgroup(memcg))) {
++		for_each_node(nid) {
++			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
++				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
++				for (i = 0; i <= DEF_PRIORITY; i++) {
++					iter = &mz->iter[i];
++					cmpxchg(&iter->position,
++						dead_memcg, NULL);
++				}
++			}
++		}
++	}
++}
++
+ /*
+  * Iteration constructs for visiting all cgroups (under a tree).  If
+  * loops are exited prematurely (break), mem_cgroup_iter_break() must
+@@ -4360,6 +4384,13 @@ static void mem_cgroup_css_offline(struc
+ 	wb_memcg_offline(memcg);
+ }
+ 
++static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
++{
++	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
++
++	invalidate_reclaim_iterators(memcg);
++}
++
+ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+@@ -5216,6 +5247,7 @@ struct cgroup_subsys memory_cgrp_subsys
+ 	.css_alloc = mem_cgroup_css_alloc,
+ 	.css_online = mem_cgroup_css_online,
+ 	.css_offline = mem_cgroup_css_offline,
++	.css_released = mem_cgroup_css_released,
+ 	.css_free = mem_cgroup_css_free,
+ 	.css_reset = mem_cgroup_css_reset,
+ 	.can_attach = mem_cgroup_can_attach,
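The invalidation scheme itself is small enough to model in isolation. In the sketch below, C11 atomics stand in for the kernel's cmpxchg() and a single global slot stands in for the per-zone iter->position array; everything here is illustrative, not the memcg implementation. The releaser clears the cached raw pointer only if it still refers to the dying object, so a reader can never pick up a stale position after the object is freed.

/* Sketch of the invalidation idea; C11 atomics stand in for cmpxchg(). */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	const char *name;
};

static _Atomic(struct obj *) iter_position;	/* like iter->position */

/* Called by the releaser (the analogue of ->css_released) before free. */
static void invalidate_iterator(struct obj *dead)
{
	struct obj *expected = dead;

	/* Clear the slot only if it still points at the dying object. */
	atomic_compare_exchange_strong(&iter_position, &expected, NULL);
}

int main(void)
{
	struct obj a = { "memcg-A" };

	atomic_store(&iter_position, &a);	/* reclaim cached this position */

	invalidate_iterator(&a);		/* object is about to be freed */

	struct obj *pos = atomic_load(&iter_position);
	printf("position after invalidation: %s\n",
	       pos ? pos->name : "(cleared)");
	return 0;
}

The compare-and-swap matters: an unconditional store could wipe out a newer position written by a concurrent reclaimer, which is why the patch uses cmpxchg(&iter->position, dead_memcg, NULL) rather than a plain assignment.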
diff --git a/queue-4.3/series b/queue-4.3/series
index 48bbfc39506..99fa0d31b20 100644
--- a/queue-4.3/series
+++ b/queue-4.3/series
@@ -92,3 +92,6 @@ media-vb2-dma-contig-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-vb2-dma-sg-fully-cache-synchronise-buffers-in-prepare-and-finish.patch
 media-v4l2-ctrls-fix-setting-autocluster-to-manual-with-vidioc_s_ctrl.patch
 revert-ivtv-avoid-going-past-input-audio-array.patch
+mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
+md-raid10-fix-data-corruption-and-crash-during-resync.patch
+fix-sysvfs-symlinks.patch
-- 
2.47.3