From: Greg Kroah-Hartman
Date: Fri, 6 Dec 2013 18:03:35 +0000 (-0800)
Subject: 3.12-stable patches
X-Git-Tag: v3.4.73~9
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a691151e17c26e1f2ea53123b2db2e2b861ed734;p=thirdparty%2Fkernel%2Fstable-queue.git

3.12-stable patches

added patches:
    md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
    mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
---
diff --git a/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
new file mode 100644
index 00000000000..9c2df96f0f0
--- /dev/null
+++ b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
@@ -0,0 +1,200 @@
+From 60aaf933854511630e16be4efe0f96485e132de4 Mon Sep 17 00:00:00 2001
+From: majianpeng
+Date: Thu, 14 Nov 2013 15:16:20 +1100
+Subject: md/raid5: Use conf->device_lock protect changing of multi-thread resources.
+
+From: majianpeng
+
+commit 60aaf933854511630e16be4efe0f96485e132de4 upstream.
+and commit 0c775d5208284700de423e6746259da54a42e1f5
+
+When we change group_thread_cnt from sysfs entry, it can OOPS.
+
+The kernel messages are:
+[ 135.299021] BUG: unable to handle kernel NULL pointer dereference at (null)
+[ 135.299073] IP: [] handle_active_stripes+0x32b/0x440
+[ 135.299107] PGD 0
+[ 135.299122] Oops: 0000 [#1] SMP
+[ 135.299144] Modules linked in: netconsole e1000e ptp pps_core
+[ 135.299188] CPU: 3 PID: 2225 Comm: md0_raid5 Not tainted 3.12.0+ #24
+[ 135.299214] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 11/09/2011
+[ 135.299255] task: ffff8800b9638f80 ti: ffff8800b77a4000 task.ti: ffff8800b77a4000
+[ 135.299283] RIP: 0010:[] [] handle_active_stripes+0x32b/0x440
+[ 135.299323] RSP: 0018:ffff8800b77a5c48 EFLAGS: 00010002
+[ 135.299344] RAX: ffff880037bb5c70 RBX: 0000000000000000 RCX: 0000000000000008
+[ 135.299371] RDX: ffff880037bb5cb8 RSI: 0000000000000001 RDI: ffff880037bb5c00
+[ 135.299398] RBP: ffff8800b77a5d08 R08: 0000000000000001 R09: 0000000000000000
+[ 135.299425] R10: ffff8800b77a5c98 R11: 00000000ffffffff R12: ffff880037bb5c00
+[ 135.299452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff880037bb5c70
+[ 135.299479] FS: 0000000000000000(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
+[ 135.299510] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
+[ 135.299532] CR2: 0000000000000000 CR3: 0000000001c0b000 CR4: 00000000000407e0
+[ 135.299559] Stack:
+[ 135.299570] ffff8800b77a5c88 ffffffff8107383e ffff8800b77a5c88 ffff880037a64300
+[ 135.299611] 000000000000ec08 ffff880037bb5cb8 ffff8800b77a5c98 ffffffffffffffd8
+[ 135.299654] 000000000000ec08 ffff880037bb5c60 ffff8800b77a5c98 ffff8800b77a5c98
+[ 135.299696] Call Trace:
+[ 135.299711] [] ? __wake_up+0x4e/0x70
+[ 135.299733] [] raid5d+0x4c8/0x680
+[ 135.299756] [] ? schedule_timeout+0x15d/0x1f0
+[ 135.299781] [] md_thread+0x11f/0x170
+[ 135.299804] [] ? wake_up_bit+0x40/0x40
+[ 135.299826] [] ? md_rdev_init+0x110/0x110
+[ 135.299850] [] kthread+0xc6/0xd0
+[ 135.299871] [] ? kthread_freezable_should_stop+0x70/0x70
+[ 135.299899] [] ret_from_fork+0x7c/0xb0
+[ 135.299923] [] ? kthread_freezable_should_stop+0x70/0x70
+[ 135.299951] Code: ff ff ff 0f 84 d7 fe ff ff e9 5c fe ff ff 66 90 41 8b b4 24 d8 01 00 00 45 31 ed 85 f6 0f 8e 7b fd ff ff 49 8b 9c 24 d0 01 00 00 <48> 3b 1b 49 89 dd 0f 85 67 fd ff ff 48 8d 43 28 31 d2 eb 17 90
+[ 135.300005] RIP [] handle_active_stripes+0x32b/0x440
+[ 135.300005] RSP
+[ 135.300005] CR2: 0000000000000000
+[ 135.300005] ---[ end trace 504854e5bb7562ed ]---
+[ 135.300005] Kernel panic - not syncing: Fatal exception
+
+This is because raid5d() can be running when the multi-thread
+resources are changed via sysfs. We need to provide locking.
+
+mddev->device_lock is suitable, but we cannot simply call
+alloc_thread_groups under this lock as we cannot allocate memory
+while holding a spinlock.
+So change alloc_thread_groups() to allocate and return the data
+structures, then raid5_store_group_thread_cnt() can take the lock
+while updating the pointers to the data structures.
+
+This fixes a bug introduced in 3.12 and so is suitable for the 3.12.x
+stable series.
+
+Fixes: b721420e8719131896b009b11edbbd27
+Signed-off-by: Jianpeng Ma
+Signed-off-by: NeilBrown
+Reviewed-by: Shaohua Li
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid5.c | 63 ++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 39 insertions(+), 24 deletions(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -5214,15 +5214,18 @@ raid5_show_group_thread_cnt(struct mddev
+ return 0;
+ }
+
+-static int alloc_thread_groups(struct r5conf *conf, int cnt);
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++ int *group_cnt,
++ int *worker_cnt_per_group,
++ struct r5worker_group **worker_groups);
+ static ssize_t
+ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+ {
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ int err;
+- struct r5worker_group *old_groups;
+- int old_group_cnt;
++ struct r5worker_group *new_groups, *old_groups;
++ int group_cnt, worker_cnt_per_group;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+@@ -5238,17 +5241,19 @@ raid5_store_group_thread_cnt(struct mdde
+ mddev_suspend(mddev);
+
+ old_groups = conf->worker_groups;
+- old_group_cnt = conf->worker_cnt_per_group;
+-
+ if (old_groups)
+ flush_workqueue(raid5_wq);
+
+- conf->worker_groups = NULL;
+- err = alloc_thread_groups(conf, new);
+- if (err) {
+- conf->worker_groups = old_groups;
+- conf->worker_cnt_per_group = old_group_cnt;
+- } else {
++ err = alloc_thread_groups(conf, new,
++ &group_cnt, &worker_cnt_per_group,
++ &new_groups);
++ if (!err) {
++ spin_lock_irq(&conf->device_lock);
++ conf->group_cnt = group_cnt;
++ conf->worker_cnt_per_group = worker_cnt_per_group;
++ conf->worker_groups = new_groups;
++ spin_unlock_irq(&conf->device_lock);
++
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+@@ -5278,33 +5283,36 @@ static struct attribute_group raid5_attr
+ .attrs = raid5_attrs,
+ };
+
+-static int alloc_thread_groups(struct r5conf *conf, int cnt)
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++ int *group_cnt,
++ int *worker_cnt_per_group,
++ struct r5worker_group **worker_groups)
+ {
+ int i, j;
+ ssize_t size;
+ struct r5worker *workers;
+
+- conf->worker_cnt_per_group = cnt;
++ *worker_cnt_per_group = cnt;
+ if (cnt == 0) {
+- conf->worker_groups = NULL;
++ *group_cnt = 0;
++ *worker_groups = NULL;
+ return 0;
+ }
+- conf->group_cnt = num_possible_nodes();
++ *group_cnt = num_possible_nodes();
+ size = sizeof(struct r5worker) * cnt;
+- workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+- conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+- conf->group_cnt, GFP_NOIO);
+- if (!conf->worker_groups || !workers) {
++ workers = kzalloc(size * *group_cnt, GFP_NOIO);
++ *worker_groups = kzalloc(sizeof(struct r5worker_group) *
++ *group_cnt, GFP_NOIO);
++ if (!*worker_groups || !workers) {
+ kfree(workers);
+- kfree(conf->worker_groups);
+- conf->worker_groups = NULL;
++ kfree(*worker_groups);
+ return -ENOMEM;
+ }
+
+- for (i = 0; i < conf->group_cnt; i++) {
++ for (i = 0; i < *group_cnt; i++) {
+ struct r5worker_group *group;
+
+- group = &conf->worker_groups[i];
++ group = &(*worker_groups)[i];
+ INIT_LIST_HEAD(&group->handle_list);
+ group->conf = conf;
+ group->workers = workers + i * cnt;
+@@ -5462,6 +5470,8 @@ static struct r5conf *setup_conf(struct
+ struct md_rdev *rdev;
+ struct disk_info *disk;
+ char pers_name[6];
++ int group_cnt, worker_cnt_per_group;
++ struct r5worker_group *new_group;
+
+ if (mddev->new_level != 5
+ && mddev->new_level != 4
+@@ -5496,7 +5506,12 @@ static struct r5conf *setup_conf(struct
+ if (conf == NULL)
+ goto abort;
+ /* Don't enable multi-threading by default*/
+- if (alloc_thread_groups(conf, 0))
++ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
++ &new_group)) {
++ conf->group_cnt = group_cnt;
++ conf->worker_cnt_per_group = worker_cnt_per_group;
++ conf->worker_groups = new_group;
++ } else
+ goto abort;
+ spin_lock_init(&conf->device_lock);
+ seqcount_init(&conf->gen_lock);
diff --git a/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
new file mode 100644
index 00000000000..46607b4b3f6
--- /dev/null
+++ b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
@@ -0,0 +1,127 @@
+From 72403b4a0fbdf433c1fe0127e49864658f6f6468 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Tue, 12 Nov 2013 15:08:32 -0800
+Subject: mm: numa: return the number of base pages altered by protection changes
+
+From: Mel Gorman
+
+commit 72403b4a0fbdf433c1fe0127e49864658f6f6468 upstream.
+
+Commit 0255d4918480 ("mm: Account for a THP NUMA hinting update as one
+PTE update") was added to account for the number of PTE updates when
+marking pages prot_numa. task_numa_work was using the old return value
+to track how much address space had been updated. Altering the return
+value causes the scanner to do more work than it is configured or
+documented to in a single unit of work.
+
+This patch reverts that commit and accounts for the number of THP
+updates separately in vmstat. It is up to the administrator to
+interpret the pair of values correctly. This is a straight-forward
+operation and likely to only be of interest when actively debugging NUMA
+balancing problems.
+
+The impact of this patch is that the NUMA PTE scanner will scan slower
+when THP is enabled and workloads may converge slower as a result. On
+the flip side system CPU usage should be lower than recent tests
+reported.
This is an illustrative example of a short single JVM specjbb +test + +specjbb + 3.12.0 3.12.0 + vanilla acctupdates +TPut 1 26143.00 ( 0.00%) 25747.00 ( -1.51%) +TPut 7 185257.00 ( 0.00%) 183202.00 ( -1.11%) +TPut 13 329760.00 ( 0.00%) 346577.00 ( 5.10%) +TPut 19 442502.00 ( 0.00%) 460146.00 ( 3.99%) +TPut 25 540634.00 ( 0.00%) 549053.00 ( 1.56%) +TPut 31 512098.00 ( 0.00%) 519611.00 ( 1.47%) +TPut 37 461276.00 ( 0.00%) 474973.00 ( 2.97%) +TPut 43 403089.00 ( 0.00%) 414172.00 ( 2.75%) + + 3.12.0 3.12.0 + vanillaacctupdates +User 5169.64 5184.14 +System 100.45 80.02 +Elapsed 252.75 251.85 + +Performance is similar but note the reduction in system CPU time. While +this showed a performance gain, it will not be universal but at least +it'll be behaving as documented. The vmstats are obviously different but +here is an obvious interpretation of them from mmtests. + + 3.12.0 3.12.0 + vanillaacctupdates +NUMA page range updates 1408326 11043064 +NUMA huge PMD updates 0 21040 +NUMA PTE updates 1408326 291624 + +"NUMA page range updates" == nr_pte_updates and is the value returned to +the NUMA pte scanner. NUMA huge PMD updates were the number of THP +updates which in combination can be used to calculate how many ptes were +updated from userspace. + +Signed-off-by: Mel Gorman +Reported-by: Alex Thorlton +Reviewed-by: Rik van Riel +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/vm_event_item.h | 1 + + mm/mprotect.c | 7 ++++++- + mm/vmstat.c | 1 + + 3 files changed, 8 insertions(+), 1 deletion(-) + +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -39,6 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS + PAGEOUTRUN, ALLOCSTALL, PGROTATED, + #ifdef CONFIG_NUMA_BALANCING + NUMA_PTE_UPDATES, ++ NUMA_HUGE_PTE_UPDATES, + NUMA_HINT_FAULTS, + NUMA_HINT_FAULTS_LOCAL, + NUMA_PAGE_MIGRATE, +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -138,6 +138,7 @@ static inline unsigned long change_pmd_r + pmd_t *pmd; + unsigned long next; + unsigned long pages = 0; ++ unsigned long nr_huge_updates = 0; + bool all_same_node; + + pmd = pmd_offset(pud, addr); +@@ -148,7 +149,8 @@ static inline unsigned long change_pmd_r + split_huge_page_pmd(vma, addr, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { +- pages++; ++ pages += HPAGE_PMD_NR; ++ nr_huge_updates++; + continue; + } + /* fall through */ +@@ -168,6 +170,9 @@ static inline unsigned long change_pmd_r + change_pmd_protnuma(vma->vm_mm, addr, pmd); + } while (pmd++, addr = next, addr != end); + ++ if (nr_huge_updates) ++ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); ++ + return pages; + } + +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -812,6 +812,7 @@ const char * const vmstat_text[] = { + + #ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", ++ "numa_huge_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", diff --git a/queue-3.12/series b/queue-3.12/series index dd7a149344f..092973a07c4 100644 --- a/queue-3.12/series +++ b/queue-3.12/series @@ -59,3 +59,5 @@ ipv6-fix-possible-seqlock-deadlock-in-ip6_finish_output2.patch pktgen-xfrm-update-ipv4-header-total-len-and-checksum-after-tranformation.patch xfrm-fix-null-pointer-dereference-when-decoding-sessions.patch xfs-add-capability-check-to-free-eofblocks-ioctl.patch +mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch +md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
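Illustrative aside (not part of the queued patches): the raid5 fix above follows a common pattern for resizing shared resources — build the replacement structures with no lock held, publish the new pointers under the spinlock (conf->device_lock in the patch), and free the old set only after the swap. The short C sketch below shows that pattern in userspace under stated assumptions: struct conf, struct worker_group, alloc_groups() and resize_groups() are invented stand-ins, a pthread spinlock stands in for the kernel spinlock, and the mddev_suspend()/flush_workqueue() quiescing the real patch relies on is omitted. It is not the kernel code.

/* Minimal sketch: allocate outside the lock, swap pointers under it. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct worker_group {
	int id;
};

struct conf {
	pthread_spinlock_t lock;	/* stands in for conf->device_lock */
	int group_cnt;
	struct worker_group *groups;
};

/* Allocate the new groups with no lock held; report results via out-params. */
static int alloc_groups(int cnt, int *group_cnt, struct worker_group **groups)
{
	struct worker_group *g;
	int i;

	if (cnt == 0) {
		*group_cnt = 0;
		*groups = NULL;
		return 0;
	}
	g = calloc(cnt, sizeof(*g));
	if (!g)
		return -1;
	for (i = 0; i < cnt; i++)
		g[i].id = i;
	*group_cnt = cnt;
	*groups = g;
	return 0;
}

/* Only the pointer update happens under the lock; allocation and free do not. */
static int resize_groups(struct conf *conf, int cnt)
{
	struct worker_group *new_groups, *old_groups;
	int group_cnt;

	if (alloc_groups(cnt, &group_cnt, &new_groups))
		return -1;

	pthread_spin_lock(&conf->lock);
	old_groups = conf->groups;
	conf->group_cnt = group_cnt;
	conf->groups = new_groups;
	pthread_spin_unlock(&conf->lock);

	free(old_groups);	/* freed only after the new set is published */
	return 0;
}

int main(void)
{
	struct conf conf = { .group_cnt = 0, .groups = NULL };

	pthread_spin_init(&conf.lock, PTHREAD_PROCESS_PRIVATE);
	if (resize_groups(&conf, 4) == 0)
		printf("group_cnt is now %d\n", conf.group_cnt);
	resize_groups(&conf, 0);
	pthread_spin_destroy(&conf.lock);
	return 0;
}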