3.12-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 6 Dec 2013 18:03:35 +0000 (10:03 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 6 Dec 2013 18:03:35 +0000 (10:03 -0800)
added patches:
md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch

queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch [new file with mode: 0644]
queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch [new file with mode: 0644]
queue-3.12/series

diff --git a/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
new file mode 100644
index 0000000..9c2df96
--- /dev/null
+++ b/queue-3.12/md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch
@@ -0,0 +1,200 @@
+From 60aaf933854511630e16be4efe0f96485e132de4 Mon Sep 17 00:00:00 2001
+From: majianpeng <majianpeng@gmail.com>
+Date: Thu, 14 Nov 2013 15:16:20 +1100
+Subject: md/raid5: Use conf->device_lock to protect changing of multi-thread resources.
+
+From: majianpeng <majianpeng@gmail.com>
+
+commit 60aaf933854511630e16be4efe0f96485e132de4 upstream.
+and commit 0c775d5208284700de423e6746259da54a42e1f5
+
+When we change group_thread_cnt via its sysfs entry, the kernel can oops.
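+
+(For reference: group_thread_cnt is the md sysfs attribute that controls
+the number of raid5 worker thread groups, so, assuming an array named
+md0, a write such as
+
+	echo 4 > /sys/block/md0/md/group_thread_cnt
+
+is the kind of change that triggers the oops below.)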
+
+The kernel messages are:
+[  135.299021] BUG: unable to handle kernel NULL pointer dereference at           (null)
+[  135.299073] IP: [<ffffffff815188ab>] handle_active_stripes+0x32b/0x440
+[  135.299107] PGD 0
+[  135.299122] Oops: 0000 [#1] SMP
+[  135.299144] Modules linked in: netconsole e1000e ptp pps_core
+[  135.299188] CPU: 3 PID: 2225 Comm: md0_raid5 Not tainted 3.12.0+ #24
+[  135.299214] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015  11/09/2011
+[  135.299255] task: ffff8800b9638f80 ti: ffff8800b77a4000 task.ti: ffff8800b77a4000
+[  135.299283] RIP: 0010:[<ffffffff815188ab>]  [<ffffffff815188ab>] handle_active_stripes+0x32b/0x440
+[  135.299323] RSP: 0018:ffff8800b77a5c48  EFLAGS: 00010002
+[  135.299344] RAX: ffff880037bb5c70 RBX: 0000000000000000 RCX: 0000000000000008
+[  135.299371] RDX: ffff880037bb5cb8 RSI: 0000000000000001 RDI: ffff880037bb5c00
+[  135.299398] RBP: ffff8800b77a5d08 R08: 0000000000000001 R09: 0000000000000000
+[  135.299425] R10: ffff8800b77a5c98 R11: 00000000ffffffff R12: ffff880037bb5c00
+[  135.299452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff880037bb5c70
+[  135.299479] FS:  0000000000000000(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
+[  135.299510] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
+[  135.299532] CR2: 0000000000000000 CR3: 0000000001c0b000 CR4: 00000000000407e0
+[  135.299559] Stack:
+[  135.299570]  ffff8800b77a5c88 ffffffff8107383e ffff8800b77a5c88 ffff880037a64300
+[  135.299611]  000000000000ec08 ffff880037bb5cb8 ffff8800b77a5c98 ffffffffffffffd8
+[  135.299654]  000000000000ec08 ffff880037bb5c60 ffff8800b77a5c98 ffff8800b77a5c98
+[  135.299696] Call Trace:
+[  135.299711]  [<ffffffff8107383e>] ? __wake_up+0x4e/0x70
+[  135.299733]  [<ffffffff81518f88>] raid5d+0x4c8/0x680
+[  135.299756]  [<ffffffff817174ed>] ? schedule_timeout+0x15d/0x1f0
+[  135.299781]  [<ffffffff81524c9f>] md_thread+0x11f/0x170
+[  135.299804]  [<ffffffff81069cd0>] ? wake_up_bit+0x40/0x40
+[  135.299826]  [<ffffffff81524b80>] ? md_rdev_init+0x110/0x110
+[  135.299850]  [<ffffffff81069656>] kthread+0xc6/0xd0
+[  135.299871]  [<ffffffff81069590>] ? kthread_freezable_should_stop+0x70/0x70
+[  135.299899]  [<ffffffff81722ffc>] ret_from_fork+0x7c/0xb0
+[  135.299923]  [<ffffffff81069590>] ? kthread_freezable_should_stop+0x70/0x70
+[  135.299951] Code: ff ff ff 0f 84 d7 fe ff ff e9 5c fe ff ff 66 90 41 8b b4 24 d8 01 00 00 45 31 ed 85 f6 0f 8e 7b fd ff ff 49 8b 9c 24 d0 01 00 00 <48> 3b 1b 49 89 dd 0f 85 67 fd ff ff 48 8d 43 28 31 d2 eb 17 90
+[  135.300005] RIP  [<ffffffff815188ab>] handle_active_stripes+0x32b/0x440
+[  135.300005]  RSP <ffff8800b77a5c48>
+[  135.300005] CR2: 0000000000000000
+[  135.300005] ---[ end trace 504854e5bb7562ed ]---
+[  135.300005] Kernel panic - not syncing: Fatal exception
+
+This is because raid5d() can still be running when the multi-thread
+resources are changed via sysfs, so we need to provide locking.
+
+conf->device_lock is suitable, but we cannot simply call
+alloc_thread_groups() under this lock, as we cannot allocate memory
+while holding a spinlock.
+So change alloc_thread_groups() to allocate and return the data
+structures, then raid5_store_group_thread_cnt() can take the lock
+while updating the pointers to the data structures.
+
+This fixes a bug introduced in 3.12 and so is suitable for the 3.12.x
+stable series.
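+
+The resulting pattern, in simplified form (a sketch of the hunks below,
+not the verbatim kernel code): allocate without locks held, publish the
+new pointers under conf->device_lock, then free the old structures.
+
+	int err, group_cnt, worker_cnt_per_group;
+	struct r5worker_group *new_groups, *old_groups;
+
+	/* 1. Allocate with no locks held; kzalloc(GFP_NOIO) may sleep. */
+	err = alloc_thread_groups(conf, new,
+				  &group_cnt, &worker_cnt_per_group,
+				  &new_groups);
+	if (!err) {
+		/* 2. Publish the new pointers under the spinlock. */
+		spin_lock_irq(&conf->device_lock);
+		old_groups = conf->worker_groups;
+		conf->group_cnt = group_cnt;
+		conf->worker_cnt_per_group = worker_cnt_per_group;
+		conf->worker_groups = new_groups;
+		spin_unlock_irq(&conf->device_lock);
+
+		/* 3. Free the old structures, again without the lock. */
+		if (old_groups)
+			kfree(old_groups[0].workers);
+		kfree(old_groups);
+	}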
+
+Fixes: b721420e8719131896b009b11edbbd27
+Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Reviewed-by: Shaohua Li <shli@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid5.c |   63 ++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 39 insertions(+), 24 deletions(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -5214,15 +5214,18 @@ raid5_show_group_thread_cnt(struct mddev
+               return 0;
+ }
+-static int alloc_thread_groups(struct r5conf *conf, int cnt);
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++                             int *group_cnt,
++                             int *worker_cnt_per_group,
++                             struct r5worker_group **worker_groups);
+ static ssize_t
+ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+ {
+       struct r5conf *conf = mddev->private;
+       unsigned long new;
+       int err;
+-      struct r5worker_group *old_groups;
+-      int old_group_cnt;
++      struct r5worker_group *new_groups, *old_groups;
++      int group_cnt, worker_cnt_per_group;
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+@@ -5238,17 +5241,19 @@ raid5_store_group_thread_cnt(struct mdde
+       mddev_suspend(mddev);
+       old_groups = conf->worker_groups;
+-      old_group_cnt = conf->worker_cnt_per_group;
+-
+       if (old_groups)
+               flush_workqueue(raid5_wq);
+-      conf->worker_groups = NULL;
+-      err = alloc_thread_groups(conf, new);
+-      if (err) {
+-              conf->worker_groups = old_groups;
+-              conf->worker_cnt_per_group = old_group_cnt;
+-      } else {
++      err = alloc_thread_groups(conf, new,
++                                &group_cnt, &worker_cnt_per_group,
++                                &new_groups);
++      if (!err) {
++              spin_lock_irq(&conf->device_lock);
++              conf->group_cnt = group_cnt;
++              conf->worker_cnt_per_group = worker_cnt_per_group;
++              conf->worker_groups = new_groups;
++              spin_unlock_irq(&conf->device_lock);
++
+               if (old_groups)
+                       kfree(old_groups[0].workers);
+               kfree(old_groups);
+@@ -5278,33 +5283,36 @@ static struct attribute_group raid5_attr
+       .attrs = raid5_attrs,
+ };
+-static int alloc_thread_groups(struct r5conf *conf, int cnt)
++static int alloc_thread_groups(struct r5conf *conf, int cnt,
++                             int *group_cnt,
++                             int *worker_cnt_per_group,
++                             struct r5worker_group **worker_groups)
+ {
+       int i, j;
+       ssize_t size;
+       struct r5worker *workers;
+-      conf->worker_cnt_per_group = cnt;
++      *worker_cnt_per_group = cnt;
+       if (cnt == 0) {
+-              conf->worker_groups = NULL;
++              *group_cnt = 0;
++              *worker_groups = NULL;
+               return 0;
+       }
+-      conf->group_cnt = num_possible_nodes();
++      *group_cnt = num_possible_nodes();
+       size = sizeof(struct r5worker) * cnt;
+-      workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+-      conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+-                              conf->group_cnt, GFP_NOIO);
+-      if (!conf->worker_groups || !workers) {
++      workers = kzalloc(size * *group_cnt, GFP_NOIO);
++      *worker_groups = kzalloc(sizeof(struct r5worker_group) *
++                              *group_cnt, GFP_NOIO);
++      if (!*worker_groups || !workers) {
+               kfree(workers);
+-              kfree(conf->worker_groups);
+-              conf->worker_groups = NULL;
++              kfree(*worker_groups);
+               return -ENOMEM;
+       }
+-      for (i = 0; i < conf->group_cnt; i++) {
++      for (i = 0; i < *group_cnt; i++) {
+               struct r5worker_group *group;
+-              group = &conf->worker_groups[i];
++              group = &(*worker_groups)[i];
+               INIT_LIST_HEAD(&group->handle_list);
+               group->conf = conf;
+               group->workers = workers + i * cnt;
+@@ -5462,6 +5470,8 @@ static struct r5conf *setup_conf(struct
+       struct md_rdev *rdev;
+       struct disk_info *disk;
+       char pers_name[6];
++      int group_cnt, worker_cnt_per_group;
++      struct r5worker_group *new_group;
+       if (mddev->new_level != 5
+           && mddev->new_level != 4
+@@ -5496,7 +5506,12 @@ static struct r5conf *setup_conf(struct
+       if (conf == NULL)
+               goto abort;
+       /* Don't enable multi-threading by default*/
+-      if (alloc_thread_groups(conf, 0))
++      if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
++                               &new_group)) {
++              conf->group_cnt = group_cnt;
++              conf->worker_cnt_per_group = worker_cnt_per_group;
++              conf->worker_groups = new_group;
++      } else
+               goto abort;
+       spin_lock_init(&conf->device_lock);
+       seqcount_init(&conf->gen_lock);
diff --git a/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
new file mode 100644
index 0000000..46607b4
--- /dev/null
+++ b/queue-3.12/mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
@@ -0,0 +1,127 @@
+From 72403b4a0fbdf433c1fe0127e49864658f6f6468 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Tue, 12 Nov 2013 15:08:32 -0800
+Subject: mm: numa: return the number of base pages altered by protection changes
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 72403b4a0fbdf433c1fe0127e49864658f6f6468 upstream.
+
+Commit 0255d4918480 ("mm: Account for a THP NUMA hinting update as one
+PTE update") was added to account for the number of PTE updates when
+marking pages prot_numa.  task_numa_work was using the old return value
+to track how much address space had been updated.  Altering the return
+value causes the scanner to do more work than it is configured or
+documented to do in a single unit of work.
+
+This patch reverts that commit and accounts for the number of THP
+updates separately in vmstat.  It is up to the administrator to
+interpret the pair of values correctly.  This is a straightforward
+operation and likely to be of interest only when actively debugging
+NUMA balancing problems.
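+
+Both counters are exported through /proc/vmstat, so the pair can be read
+together; for example (values illustrative):
+
+	$ grep numa_ /proc/vmstat
+	numa_pte_updates 291624
+	numa_huge_pte_updates 21040
+	...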
+
+The impact of this patch is that the NUMA PTE scanner will scan more
+slowly when THP is enabled, and workloads may converge more slowly as a
+result.  On the flip side, system CPU usage should be lower than recent
+tests reported.  Below is an illustrative example from a short
+single-JVM specjbb test.
+
+specjbb
+                       3.12.0                3.12.0
+                      vanilla      acctupdates
+TPut 1      26143.00 (  0.00%)     25747.00 ( -1.51%)
+TPut 7     185257.00 (  0.00%)    183202.00 ( -1.11%)
+TPut 13    329760.00 (  0.00%)    346577.00 (  5.10%)
+TPut 19    442502.00 (  0.00%)    460146.00 (  3.99%)
+TPut 25    540634.00 (  0.00%)    549053.00 (  1.56%)
+TPut 31    512098.00 (  0.00%)    519611.00 (  1.47%)
+TPut 37    461276.00 (  0.00%)    474973.00 (  2.97%)
+TPut 43    403089.00 (  0.00%)    414172.00 (  2.75%)
+
+              3.12.0      3.12.0
+             vanilla  acctupdates
+User         5169.64     5184.14
+System        100.45       80.02
+Elapsed       252.75      251.85
+
+Performance is similar, but note the reduction in system CPU time.  While
+this run showed a performance gain, the gain will not be universal, but
+at least the scanner will behave as documented.  The vmstat counters are
+obviously different; here is an interpretation of them from mmtests.
+
+                                3.12.0      3.12.0
+                               vanilla  acctupdates
+NUMA page range updates        1408326    11043064
+NUMA huge PMD updates                0       21040
+NUMA PTE updates               1408326      291624
+
+"NUMA page range updates" == nr_pte_updates and is the value returned to
+the NUMA pte scanner.  NUMA huge PMD updates were the number of THP
+updates which in combination can be used to calculate how many ptes were
+updated from userspace.
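+
+Concretely, assuming the common x86_64 configuration where one THP covers
+HPAGE_PMD_NR == 512 base pages, the figures above roughly reconcile as
+
+	291624 + (21040 * 512) = 11064104 =~ 11043064 page range updates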
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reported-by: Alex Thorlton <athorlton@sgi.com>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/vm_event_item.h |    1 +
+ mm/mprotect.c                 |    7 ++++++-
+ mm/vmstat.c                   |    1 +
+ 3 files changed, 8 insertions(+), 1 deletion(-)
+
+--- a/include/linux/vm_event_item.h
++++ b/include/linux/vm_event_item.h
+@@ -39,6 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
+               PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ #ifdef CONFIG_NUMA_BALANCING
+               NUMA_PTE_UPDATES,
++              NUMA_HUGE_PTE_UPDATES,
+               NUMA_HINT_FAULTS,
+               NUMA_HINT_FAULTS_LOCAL,
+               NUMA_PAGE_MIGRATE,
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -138,6 +138,7 @@ static inline unsigned long change_pmd_r
+       pmd_t *pmd;
+       unsigned long next;
+       unsigned long pages = 0;
++      unsigned long nr_huge_updates = 0;
+       bool all_same_node;
+       pmd = pmd_offset(pud, addr);
+@@ -148,7 +149,8 @@ static inline unsigned long change_pmd_r
+                               split_huge_page_pmd(vma, addr, pmd);
+                       else if (change_huge_pmd(vma, pmd, addr, newprot,
+                                                prot_numa)) {
+-                              pages++;
++                              pages += HPAGE_PMD_NR;
++                              nr_huge_updates++;
+                               continue;
+                       }
+                       /* fall through */
+@@ -168,6 +170,9 @@ static inline unsigned long change_pmd_r
+                       change_pmd_protnuma(vma->vm_mm, addr, pmd);
+       } while (pmd++, addr = next, addr != end);
++      if (nr_huge_updates)
++              count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
++
+       return pages;
+ }
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
+ #ifdef CONFIG_NUMA_BALANCING
+       "numa_pte_updates",
++      "numa_huge_pte_updates",
+       "numa_hint_faults",
+       "numa_hint_faults_local",
+       "numa_pages_migrated",
diff --git a/queue-3.12/series b/queue-3.12/series
index dd7a149344f22af67eeceb799dbefdd442e28d6c..092973a07c47f81f017ae8028bbab25d8bf78f60 100644
--- a/queue-3.12/series
+++ b/queue-3.12/series
@@ -59,3 +59,5 @@ ipv6-fix-possible-seqlock-deadlock-in-ip6_finish_output2.patch
 pktgen-xfrm-update-ipv4-header-total-len-and-checksum-after-tranformation.patch
 xfrm-fix-null-pointer-dereference-when-decoding-sessions.patch
 xfs-add-capability-check-to-free-eofblocks-ioctl.patch
+mm-numa-return-the-number-of-base-pages-altered-by-protection-changes.patch
+md-raid5-use-conf-device_lock-protect-changing-of-multi-thread-resources.patch