From: Greg Kroah-Hartman
Date: Fri, 6 Dec 2013 18:03:35 +0000 (-0800)
Subject: 3.12-stable patches
X-Git-Tag: v3.4.73~9
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a691151e17c26e1f2ea53123b2db2e2b861ed734;p=thirdparty%2Fkernel%2Fstable-queue.git

3.12-stable patches

added patches:
    md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
    mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
---
diff --git a/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
new file mode 100644
index 00000000000..9c2df96f0f0
--- /dev/null
+++ b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
@@ -0,0 +1,200 @@
+From 60aaf933854511630e16be4efe0f96485e132de4 Mon Sep 17 00:00:00 2001
+From: majianpeng
+Date: Thu, 14 Nov 2013 15:16:20 +1100
+Subject: md/raid5: Use conf->device_lock protect changing of multi-thread resources.
+
+From: majianpeng
+
+commit 60aaf933854511630e16be4efe0f96485e132de4 upstream.
+and commit 0c775d5208284700de423e6746259da54a42e1f5
+
+When we change group_thread_cnt from sysfs entry, it can OOPS.
+
+The kernel messages are:
+[ 135.299021] BUG: unable to handle kernel NULL pointer dereference at (null)
+[ 135.299073] IP: [] handle_active_stripes+0x32b/0x440
+[ 135.299107] PGD 0
+[ 135.299122] Oops: 0000 [#1] SMP
+[ 135.299144] Modules linked in: netconsole e1000e ptp pps_core
+[ 135.299188] CPU: 3 PID: 2225 Comm: md0_raid5 Not tainted 3.12.0+ #24
+[ 135.299214] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 11/09/2011
+[ 135.299255] task: ffff8800b9638f80 ti: ffff8800b77a4000 task.ti: ffff8800b77a4000
+[ 135.299283] RIP: 0010:[] [] handle_active_stripes+0x32b/0x440
+[ 135.299323] RSP: 0018:ffff8800b77a5c48 EFLAGS: 00010002
+[ 135.299344] RAX: ffff880037bb5c70 RBX: 0000000000000000 RCX: 0000000000000008
+[ 135.299371] RDX: ffff880037bb5cb8 RSI: 0000000000000001 RDI: ffff880037bb5c00
+[ 135.299398] RBP: ffff8800b77a5d08 R08: 0000000000000001 R09: 0000000000000000
+[ 135.299425] R10: ffff8800b77a5c98 R11: 00000000ffffffff R12: ffff880037bb5c00
+[ 135.299452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff880037bb5c70
+[ 135.299479] FS: 0000000000000000(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
+[ 135.299510] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
+[ 135.299532] CR2: 0000000000000000 CR3: 0000000001c0b000 CR4: 00000000000407e0
+[ 135.299559] Stack:
+[ 135.299570] ffff8800b77a5c88 ffffffff8107383e ffff8800b77a5c88 ffff880037a64300
+[ 135.299611] 000000000000ec08 ffff880037bb5cb8 ffff8800b77a5c98 ffffffffffffffd8
+[ 135.299654] 000000000000ec08 ffff880037bb5c60 ffff8800b77a5c98 ffff8800b77a5c98
+[ 135.299696] Call Trace:
+[ 135.299711] [] ? __wake_up+0x4e/0x70
+[ 135.299733] [] raid5d+0x4c8/0x680
+[ 135.299756] [] ? schedule_timeout+0x15d/0x1f0
+[ 135.299781] [] md_thread+0x11f/0x170
+[ 135.299804] [] ? wake_up_bit+0x40/0x40
+[ 135.299826] [] ? md_rdev_init+0x110/0x110
+[ 135.299850] [] kthread+0xc6/0xd0
+[ 135.299871] [] ? kthread_freezable_should_stop+0x70/0x70
+[ 135.299899] [] ret_from_fork+0x7c/0xb0
+[ 135.299923] [] ? kthread_freezable_should_stop+0x70/0x70
+[ 135.299951] Code: ff ff ff 0f 84 d7 fe ff ff e9 5c fe ff ff 66 90 41 8b b4 24 d8 01 00 00 45 31 ed 85 f6 0f 8e 7b fd ff ff 49 8b 9c 24 d0 01 00 00 <48> 3b 1b 49 89 dd 0f 85 67 fd ff ff 48 8d 43 28 31 d2 eb 17 90
+[ 135.300005] RIP [] handle_active_stripes+0x32b/0x440
+[ 135.300005] RSP
+[ 135.300005] CR2: 0000000000000000
+[ 135.300005] ---[ end trace 504854e5bb7562ed ]---
+[ 135.300005] Kernel panic - not syncing: Fatal exception
+
+This is because raid5d() can be running when the multi-thread
+resources are changed via sysfs. We need to provide locking.
+
+mddev->device_lock is suitable, but we cannot simply call
+alloc_thread_groups under this lock as we cannot allocate memory
+while holding a spinlock.
+So change alloc_thread_groups() to allocate and return the data
+structures, then raid5_store_group_thread_cnt() can take the lock
+while updating the pointers to the data structures.
+
+This fixes a bug introduced in 3.12 and so is suitable for the 3.12.x
+stable series.
+
+Fixes: b721420e8719131896b009b11edbbd27
+Signed-off-by: Jianpeng Ma
+Signed-off-by: NeilBrown
+Reviewed-by: Shaohua Li
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid5.c | 63 ++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 39 insertions(+), 24 deletions(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -5214,15 +5214,18 @@ raid5_show_group_thread_cnt(struct mddev
+ return 0;
+ }
+
+-static int alloc_thread_groups(struct r5conf *conf, int cnt);
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++ int *group_cnt,
++ int *worker_cnt_per_group,
++ struct r5worker_group **worker_groups);
+ static ssize_t
+ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+ {
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ int err;
+- struct r5worker_group *old_groups;
+- int old_group_cnt;
++ struct r5worker_group *new_groups, *old_groups;
++ int group_cnt, worker_cnt_per_group;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+@@ -5238,17 +5241,19 @@ raid5_store_group_thread_cnt(struct mdde
+ mddev_suspend(mddev);
+
+ old_groups = conf->worker_groups;
+- old_group_cnt = conf->worker_cnt_per_group;
+-
+ if (old_groups)
+ flush_workqueue(raid5_wq);
+
+- conf->worker_groups = NULL;
+- err = alloc_thread_groups(conf, new);
+- if (err) {
+- conf->worker_groups = old_groups;
+- conf->worker_cnt_per_group = old_group_cnt;
+- } else {
++ err = alloc_thread_groups(conf, new,
++ &group_cnt, &worker_cnt_per_group,
++ &new_groups);
++ if (!err) {
++ spin_lock_irq(&conf->device_lock);
++ conf->group_cnt = group_cnt;
++ conf->worker_cnt_per_group = worker_cnt_per_group;
++ conf->worker_groups = new_groups;
++ spin_unlock_irq(&conf->device_lock);
++
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+@@ -5278,33 +5283,36 @@ static struct attribute_group raid5_attr
+ .attrs = raid5_attrs,
+ };
+
+-static int alloc_thread_groups(struct r5conf *conf, int cnt)
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++ int *group_cnt,
++ int *worker_cnt_per_group,
++ struct r5worker_group **worker_groups)
+ {
+ int i, j;
+ ssize_t size;
+ struct r5worker *workers;
+
+- conf->worker_cnt_per_group = cnt;
++ *worker_cnt_per_group = cnt;
+ if (cnt == 0) {
+- conf->worker_groups = NULL;
++ *group_cnt = 0;
++ *worker_groups = NULL;
+ return 0;
+ }
+- conf->group_cnt = num_possible_nodes();
++ *group_cnt = num_possible_nodes();
+ size = sizeof(struct r5worker) * cnt;
+- workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+- conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+- conf->group_cnt, GFP_NOIO);
+- if (!conf->worker_groups || !workers) {
++ workers = kzalloc(size * *group_cnt, GFP_NOIO);
++ *worker_groups = kzalloc(sizeof(struct r5worker_group) *
++ *group_cnt, GFP_NOIO);
++ if (!*worker_groups || !workers) {
+ kfree(workers);
+- kfree(conf->worker_groups);
+- conf->worker_groups = NULL;
++ kfree(*worker_groups);
+ return -ENOMEM;
+ }
+
+- for (i = 0; i < conf->group_cnt; i++) {
++ for (i = 0; i < *group_cnt; i++) {
+ struct r5worker_group *group;
+
+- group = &conf->worker_groups[i];
++ group = &(*worker_groups)[i];
+ INIT_LIST_HEAD(&group->handle_list);
+ group->conf = conf;
+ group->workers = workers + i * cnt;
+@@ -5462,6 +5470,8 @@ static struct r5conf *setup_conf(struct
+ struct md_rdev *rdev;
+ struct disk_info *disk;
+ char pers_name[6];
++ int group_cnt, worker_cnt_per_group;
++ struct r5worker_group *new_group;
+
+ if (mddev->new_level != 5
+ && mddev->new_level != 4
+@@ -5496,7 +5506,12 @@ static struct r5conf *setup_conf(struct
+ if (conf == NULL)
+ goto abort;
+ /* Don't enable multi-threading by default*/
+- if (alloc_thread_groups(conf, 0))
++ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
++ &new_group)) {
++ conf->group_cnt = group_cnt;
++ conf->worker_cnt_per_group = worker_cnt_per_group;
++ conf->worker_groups = new_group;
++ } else
+ goto abort;
+ spin_lock_init(&conf->device_lock);
+ seqcount_init(&conf->gen_lock);
diff --git a/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
new file mode 100644
index 00000000000..46607b4b3f6
--- /dev/null
+++ b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
@@ -0,0 +1,127 @@
+From 72403b4a0fbdf433c1fe0127e49864658f6f6468 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Tue, 12 Nov 2013 15:08:32 -0800
+Subject: mm: numa: return the number of base pages altered by protection changes
+
+From: Mel Gorman
+
+commit 72403b4a0fbdf433c1fe0127e49864658f6f6468 upstream.
+
+Commit 0255d4918480 ("mm: Account for a THP NUMA hinting update as one
+PTE update") was added to account for the number of PTE updates when
+marking pages prot_numa. task_numa_work was using the old return value
+to track how much address space had been updated. Altering the return
+value causes the scanner to do more work than it is configured or
+documented to in a single unit of work.
+
+This patch reverts that commit and accounts for the number of THP
+updates separately in vmstat. It is up to the administrator to
+interpret the pair of values correctly. This is a straight-forward
+operation and likely to only be of interest when actively debugging NUMA
+balancing problems.
+
+The impact of this patch is that the NUMA PTE scanner will scan slower
+when THP is enabled and workloads may converge slower as a result. On
+the flip side system CPU usage should be lower than recent tests
+reported.
This is an illustrative example of a short single JVM specjbb +test + +specjbb + 3.12.0 3.12.0 + vanilla acctupdates +TPut 1 26143.00 ( 0.00%) 25747.00 ( -1.51%) +TPut 7 185257.00 ( 0.00%) 183202.00 ( -1.11%) +TPut 13 329760.00 ( 0.00%) 346577.00 ( 5.10%) +TPut 19 442502.00 ( 0.00%) 460146.00 ( 3.99%) +TPut 25 540634.00 ( 0.00%) 549053.00 ( 1.56%) +TPut 31 512098.00 ( 0.00%) 519611.00 ( 1.47%) +TPut 37 461276.00 ( 0.00%) 474973.00 ( 2.97%) +TPut 43 403089.00 ( 0.00%) 414172.00 ( 2.75%) + + 3.12.0 3.12.0 + vanillaacctupdates +User 5169.64 5184.14 +System 100.45 80.02 +Elapsed 252.75 251.85 + +Performance is similar but note the reduction in system CPU time. While +this showed a performance gain, it will not be universal but at least +it'll be behaving as documented. The vmstats are obviously different but +here is an obvious interpretation of them from mmtests. + + 3.12.0 3.12.0 + vanillaacctupdates +NUMA page range updates 1408326 11043064 +NUMA huge PMD updates 0 21040 +NUMA PTE updates 1408326 291624 + +"NUMA page range updates" == nr_pte_updates and is the value returned to +the NUMA pte scanner. NUMA huge PMD updates were the number of THP +updates which in combination can be used to calculate how many ptes were +updated from userspace. + +Signed-off-by: Mel Gorman +Reported-by: Alex Thorlton +Reviewed-by: Rik van Riel +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/vm_event_item.h | 1 + + mm/mprotect.c | 7 ++++++- + mm/vmstat.c | 1 + + 3 files changed, 8 insertions(+), 1 deletion(-) + +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -39,6 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS + PAGEOUTRUN, ALLOCSTALL, PGROTATED, + #ifdef CONFIG_NUMA_BALANCING + NUMA_PTE_UPDATES, ++ NUMA_HUGE_PTE_UPDATES, + NUMA_HINT_FAULTS, + NUMA_HINT_FAULTS_LOCAL, + NUMA_PAGE_MIGRATE, +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -138,6 +138,7 @@ static inline unsigned long change_pmd_r + pmd_t *pmd; + unsigned long next; + unsigned long pages = 0; ++ unsigned long nr_huge_updates = 0; + bool all_same_node; + + pmd = pmd_offset(pud, addr); +@@ -148,7 +149,8 @@ static inline unsigned long change_pmd_r + split_huge_page_pmd(vma, addr, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { +- pages++; ++ pages += HPAGE_PMD_NR; ++ nr_huge_updates++; + continue; + } + /* fall through */ +@@ -168,6 +170,9 @@ static inline unsigned long change_pmd_r + change_pmd_protnuma(vma->vm_mm, addr, pmd); + } while (pmd++, addr = next, addr != end); + ++ if (nr_huge_updates) ++ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); ++ + return pages; + } + +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -812,6 +812,7 @@ const char * const vmstat_text[] = { + + #ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", ++ "numa_huge_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", diff --git a/queue-3.12/series b/queue-3.12/series index dd7a149344f..092973a07c4 100644 --- a/queue-3.12/series +++ b/queue-3.12/series @@ -59,3 +59,5 @@ ipv6-fix-possible-seqlock-deadlock-in-ip6_finish_output2.patch pktgen-xfrm-update-ipv4-header-total-len-and-checksum-after-tranformation.patch xfrm-fix-null-pointer-dereference-when-decoding-sessions.patch xfs-add-capability-check-to-free-eofblocks-ioctl.patch +mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch +md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
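Illustrative aside (not part of the queued patches): the raid5 fix above follows a common pattern for resizing shared resources — build the replacement structures with no lock held, publish the new pointers under the spinlock (conf->device_lock in the patch), and free the old set only after the swap. The short C sketch below shows that pattern in userspace under stated assumptions: struct conf, struct worker_group, alloc_groups() and resize_groups() are invented stand-ins, a pthread spinlock stands in for the kernel spinlock, and the mddev_suspend()/flush_workqueue() quiescing the real patch relies on is omitted. It is not the kernel code.

/* Minimal sketch: allocate outside the lock, swap pointers under it. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct worker_group {
	int id;
};

struct conf {
	pthread_spinlock_t lock;	/* stands in for conf->device_lock */
	int group_cnt;
	struct worker_group *groups;
};

/* Allocate the new groups with no lock held; report results via out-params. */
static int alloc_groups(int cnt, int *group_cnt, struct worker_group **groups)
{
	struct worker_group *g;
	int i;

	if (cnt == 0) {
		*group_cnt = 0;
		*groups = NULL;
		return 0;
	}
	g = calloc(cnt, sizeof(*g));
	if (!g)
		return -1;
	for (i = 0; i < cnt; i++)
		g[i].id = i;
	*group_cnt = cnt;
	*groups = g;
	return 0;
}

/* Only the pointer update happens under the lock; allocation and free do not. */
static int resize_groups(struct conf *conf, int cnt)
{
	struct worker_group *new_groups, *old_groups;
	int group_cnt;

	if (alloc_groups(cnt, &group_cnt, &new_groups))
		return -1;

	pthread_spin_lock(&conf->lock);
	old_groups = conf->groups;
	conf->group_cnt = group_cnt;
	conf->groups = new_groups;
	pthread_spin_unlock(&conf->lock);

	free(old_groups);	/* freed only after the new set is published */
	return 0;
}

int main(void)
{
	struct conf conf = { .group_cnt = 0, .groups = NULL };

	pthread_spin_init(&conf.lock, PTHREAD_PROCESS_PRIVATE);
	if (resize_groups(&conf, 4) == 0)
		printf("group_cnt is now %d\n", conf.group_cnt);
	resize_groups(&conf, 0);
	pthread_spin_destroy(&conf.lock);
	return 0;
}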