3.9-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Fri, 14 Jun 2013 21:58:03 +0000 (14:58 -0700)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Fri, 14 Jun 2013 21:58:03 +0000 (14:58 -0700)
added patches:
md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
mm-migration-add-migrate_entry_wait_huge.patch
mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch
x86-fix-adjust_range_size_mask-calling-position.patch
x86-fix-typo-in-kexec-register-clearing.patch

queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch [new file with mode: 0644]
queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch [new file with mode: 0644]
queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch [new file with mode: 0644]
queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch [new file with mode: 0644]
queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch [new file with mode: 0644]
queue-3.9/series
queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch [new file with mode: 0644]
queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch [new file with mode: 0644]

diff --git a/queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch b/queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
new file mode 100644 (file)
index 0000000..fd49e82
--- /dev/null
@@ -0,0 +1,89 @@
+From 5026d7a9b2f3eb1f9bda66c18ac6bc3036ec9020 Mon Sep 17 00:00:00 2001
+From: "H. Peter Anvin" <hpa@zytor.com>
+Date: Wed, 12 Jun 2013 07:37:43 -0700
+Subject: md/raid1,5,10: Disable WRITE SAME until a recovery strategy is in place
+
+From: "H. Peter Anvin" <hpa@zytor.com>
+
+commit 5026d7a9b2f3eb1f9bda66c18ac6bc3036ec9020 upstream.
+
+There are cases where the kernel will believe that the WRITE SAME
+command is supported by a block device which does not, in fact,
+support WRITE SAME.  This currently happens for SATA drives behind a
+SAS controller, but there are probably a hundred other ways that can
+happen, including drive firmware bugs.
+
+After receiving an error for WRITE SAME the block layer will retry the
+request as a plain write of zeroes, but mdraid will treat the
+failure as fatal and consider the drive failed.  This has the effect
+that all the mirrors containing a specific set of data are each
+offlined in very rapid succession, resulting in data loss.
+
+However, just bouncing the request back up to the block layer isn't
+ideal either, because the whole initial request-retry sequence should
+be inside the write bitmap fence, which probably means that md needs
+to do its own conversion of WRITE SAME to write zero.
+
+Until the failure scenario has been sorted out, disable WRITE SAME for
+raid1, raid5, and raid10.
+
+[neilb: added raid5]
+
+This patch is appropriate for any -stable since 3.7 when write_same
+support was added.
+
+Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid1.c  |    4 ++--
+ drivers/md/raid10.c |    3 +--
+ drivers/md/raid5.c  |    4 +++-
+ 3 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -2837,8 +2837,8 @@ static int run(struct mddev *mddev)
+               return PTR_ERR(conf);
+       if (mddev->queue)
+-              blk_queue_max_write_same_sectors(mddev->queue,
+-                                               mddev->chunk_sectors);
++              blk_queue_max_write_same_sectors(mddev->queue, 0);
++
+       rdev_for_each(rdev, mddev) {
+               if (!mddev->gendisk)
+                       continue;
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -3635,8 +3635,7 @@ static int run(struct mddev *mddev)
+       if (mddev->queue) {
+               blk_queue_max_discard_sectors(mddev->queue,
+                                             mddev->chunk_sectors);
+-              blk_queue_max_write_same_sectors(mddev->queue,
+-                                               mddev->chunk_sectors);
++              blk_queue_max_write_same_sectors(mddev->queue, 0);
+               blk_queue_io_min(mddev->queue, chunk_size);
+               if (conf->geo.raid_disks % conf->geo.near_copies)
+                       blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -5457,7 +5457,7 @@ static int run(struct mddev *mddev)
+               if (mddev->major_version == 0 &&
+                   mddev->minor_version > 90)
+                       rdev->recovery_offset = reshape_offset;
+-                      
++
+               if (rdev->recovery_offset < reshape_offset) {
+                       /* We need to check old and new layout */
+                       if (!only_parity(rdev->raid_disk,
+@@ -5580,6 +5580,8 @@ static int run(struct mddev *mddev)
+                */
+               mddev->queue->limits.discard_zeroes_data = 0;
++              blk_queue_max_write_same_sectors(mddev->queue, 0);
++
+               rdev_for_each(rdev, mddev) {
+                       disk_stack_limits(mddev->gendisk, rdev->bdev,
+                                         rdev->data_offset << 9);
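
A note on the knob used above: blk_queue_max_write_same_sectors() is the
standard block-layer limit setter (in <linux/blkdev.h> since 3.7, when
WRITE SAME support was added), and a value of 0 tells upper layers that the
device should not be sent WRITE SAME at all.  A minimal, illustrative
sketch -- not part of the patch, function name made up -- of the same
opt-out for an arbitrary queue:

    #include <linux/blkdev.h>

    /* Advertise "no WRITE SAME": zero-out helpers such as
     * blkdev_issue_zeroout() then fall back to plain writes of zero
     * pages instead of issuing the command to the device. */
    static void example_disable_write_same(struct request_queue *q)
    {
            blk_queue_max_write_same_sectors(q, 0);
    }
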
diff --git a/queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch b/queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
new file mode 100644 (file)
index 0000000..7d322d9
--- /dev/null
@@ -0,0 +1,88 @@
+From 3056e3aec8d8ba61a0710fb78b2d562600aa2ea7 Mon Sep 17 00:00:00 2001
+From: Alex Lyakas <alex@zadarastorage.com>
+Date: Tue, 4 Jun 2013 20:42:21 +0300
+Subject: md/raid1: consider WRITE as successful only if at least one non-Faulty and non-rebuilding drive completed it.
+
+From: Alex Lyakas <alex@zadarastorage.com>
+
+commit 3056e3aec8d8ba61a0710fb78b2d562600aa2ea7 upstream.
+
+Without that fix, the following scenario could happen:
+
+- RAID1 with drives A and B; drive B was freshly-added and is rebuilding
+- Drive A fails
+- WRITE request arrives to the array. It is failed by drive A, so
+r1_bio is marked as R1BIO_WriteError, but the rebuilding drive B
+succeeds in writing it, so the same r1_bio is marked as
+R1BIO_Uptodate.
+- r1_bio arrives to handle_write_finished, badblocks are disabled,
+md_error()->error() does nothing because we don't fail the last drive
+of raid1
+- raid_end_bio_io() calls call_bio_endio()
+- As a result, in call_bio_endio():
+        if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+this code doesn't clear the BIO_UPTODATE flag, so the whole master
+WRITE is reported as successful to the upper layer.
+
+So we returned success to the upper layer, even though we had written
+the data onto the rebuilding drive only. But when we want to read the
+data back, we would not read from the rebuilding drive, so this data
+is lost.
+
+[neilb - applied identical change to raid10 as well]
+
+This bug can result in lost data, so it is suitable for any
+-stable kernel.
+
+Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid1.c  |   12 +++++++++++-
+ drivers/md/raid10.c |   12 +++++++++++-
+ 2 files changed, 22 insertions(+), 2 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -427,7 +427,17 @@ static void raid1_end_write_request(stru
+               r1_bio->bios[mirror] = NULL;
+               to_put = bio;
+-              set_bit(R1BIO_Uptodate, &r1_bio->state);
++              /*
++               * Do not set R1BIO_Uptodate if the current device is
++               * rebuilding or Faulty. This is because we cannot use
++               * such device for properly reading the data back (we could
++               * potentially use it, if the current write would have fallen
++               * before rdev->recovery_offset, but for simplicity we don't
++               * check this here).
++               */
++              if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
++                  !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
++                      set_bit(R1BIO_Uptodate, &r1_bio->state);
+               /* Maybe we can clear some bad blocks. */
+               if (is_badblock(conf->mirrors[mirror].rdev,
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -490,7 +490,17 @@ static void raid10_end_write_request(str
+               sector_t first_bad;
+               int bad_sectors;
+-              set_bit(R10BIO_Uptodate, &r10_bio->state);
++              /*
++               * Do not set R10BIO_Uptodate if the current device is
++               * rebuilding or Faulty. This is because we cannot use
++               * such device for properly reading the data back (we could
++               * potentially use it, if the current write would have fallen
++               * before rdev->recovery_offset, but for simplicity we don't
++               * check this here).
++               */
++              if (test_bit(In_sync, &rdev->flags) &&
++                  !test_bit(Faulty, &rdev->flags))
++                      set_bit(R10BIO_Uptodate, &r10_bio->state);
+               /* Maybe we can clear some bad blocks. */
+               if (is_badblock(rdev,
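
The whole fix above boils down to one predicate: a completed write only
counts toward the "Uptodate" state if the device it landed on can also
serve reads later.  A hedged sketch of that predicate as a stand-alone
helper (hypothetical name, written as if it lived in drivers/md):

    #include "md.h" /* drivers/md/md.h: struct md_rdev, In_sync, Faulty */

    /* Only a fully-synced, non-Faulty member can satisfy later reads,
     * so only its completion may mark the master write as Uptodate. */
    static inline bool example_write_counts(struct md_rdev *rdev)
    {
            return test_bit(In_sync, &rdev->flags) &&
                   !test_bit(Faulty, &rdev->flags);
    }
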
diff --git a/queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch b/queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
new file mode 100644 (file)
index 0000000..e91d523
--- /dev/null
@@ -0,0 +1,176 @@
+From e2d59925221cd562e07fee38ec8839f7209ae603 Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Wed, 12 Jun 2013 11:01:22 +1000
+Subject: md/raid1,raid10: use freeze_array in place of raise_barrier in various places.
+
+From: NeilBrown <neilb@suse.de>
+
+commit e2d59925221cd562e07fee38ec8839f7209ae603 upstream.
+
+Various places in raid1 and raid10 are calling raise_barrier when they
+really should call freeze_array.
+The former is only intended to be called from "make_request".
+The latter has extra checks for 'nr_queued' and makes a call to
+flush_pending_writes(), so it is safe to call it from within the
+management thread.
+
+Using raise_barrier will sometimes deadlock.  Using freeze_array
+should not.
+
+As 'freeze_array' currently expects one request to be pending (in
+handle_read_error - the only previous caller), we need to pass
+it the number of pending requests (extra) to ignore.
+
+The deadlock was made particularly noticeable by commits
+050b66152f87c7 (raid10) and 6b740b8d79252f13 (raid1) which
+appeared in 3.4, so the fix is appropriate for any -stable
+kernel since then.
+
+This patch probably won't apply directly to some early kernels and
+will need to be applied by hand.
+
+Reported-by: Alexander Lyakas <alex.bolshoy@gmail.com>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid1.c  |   22 +++++++++++-----------
+ drivers/md/raid10.c |   14 +++++++-------
+ 2 files changed, 18 insertions(+), 18 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -890,17 +890,17 @@ static void allow_barrier(struct r1conf
+       wake_up(&conf->wait_barrier);
+ }
+-static void freeze_array(struct r1conf *conf)
++static void freeze_array(struct r1conf *conf, int extra)
+ {
+       /* stop syncio and normal IO and wait for everything to
+        * go quite.
+        * We increment barrier and nr_waiting, and then
+-       * wait until nr_pending match nr_queued+1
++       * wait until nr_pending match nr_queued+extra
+        * This is called in the context of one normal IO request
+        * that has failed. Thus any sync request that might be pending
+        * will be blocked by nr_pending, and we need to wait for
+        * pending IO requests to complete or be queued for re-try.
+-       * Thus the number queued (nr_queued) plus this request (1)
++       * Thus the number queued (nr_queued) plus this request (extra)
+        * must match the number of pending IOs (nr_pending) before
+        * we continue.
+        */
+@@ -908,7 +908,7 @@ static void freeze_array(struct r1conf *
+       conf->barrier++;
+       conf->nr_waiting++;
+       wait_event_lock_irq_cmd(conf->wait_barrier,
+-                              conf->nr_pending == conf->nr_queued+1,
++                              conf->nr_pending == conf->nr_queued+extra,
+                               conf->resync_lock,
+                               flush_pending_writes(conf));
+       spin_unlock_irq(&conf->resync_lock);
+@@ -1568,8 +1568,8 @@ static int raid1_add_disk(struct mddev *
+                * we wait for all outstanding requests to complete.
+                */
+               synchronize_sched();
+-              raise_barrier(conf);
+-              lower_barrier(conf);
++              freeze_array(conf, 0);
++              unfreeze_array(conf);
+               clear_bit(Unmerged, &rdev->flags);
+       }
+       md_integrity_add_rdev(rdev, mddev);
+@@ -1619,11 +1619,11 @@ static int raid1_remove_disk(struct mdde
+                        */
+                       struct md_rdev *repl =
+                               conf->mirrors[conf->raid_disks + number].rdev;
+-                      raise_barrier(conf);
++                      freeze_array(conf, 0);
+                       clear_bit(Replacement, &repl->flags);
+                       p->rdev = repl;
+                       conf->mirrors[conf->raid_disks + number].rdev = NULL;
+-                      lower_barrier(conf);
++                      unfreeze_array(conf);
+                       clear_bit(WantReplacement, &rdev->flags);
+               } else
+                       clear_bit(WantReplacement, &rdev->flags);
+@@ -2240,7 +2240,7 @@ static void handle_read_error(struct r1c
+        * frozen
+        */
+       if (mddev->ro == 0) {
+-              freeze_array(conf);
++              freeze_array(conf, 1);
+               fix_read_error(conf, r1_bio->read_disk,
+                              r1_bio->sector, r1_bio->sectors);
+               unfreeze_array(conf);
+@@ -3019,7 +3019,7 @@ static int raid1_reshape(struct mddev *m
+               return -ENOMEM;
+       }
+-      raise_barrier(conf);
++      freeze_array(conf, 0);
+       /* ok, everything is stopped */
+       oldpool = conf->r1bio_pool;
+@@ -3050,7 +3050,7 @@ static int raid1_reshape(struct mddev *m
+       conf->raid_disks = mddev->raid_disks = raid_disks;
+       mddev->delta_disks = 0;
+-      lower_barrier(conf);
++      unfreeze_array(conf);
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       md_wakeup_thread(mddev->thread);
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1065,17 +1065,17 @@ static void allow_barrier(struct r10conf
+       wake_up(&conf->wait_barrier);
+ }
+-static void freeze_array(struct r10conf *conf)
++static void freeze_array(struct r10conf *conf, int extra)
+ {
+       /* stop syncio and normal IO and wait for everything to
+        * go quiet.
+        * We increment barrier and nr_waiting, and then
+-       * wait until nr_pending match nr_queued+1
++       * wait until nr_pending match nr_queued+extra
+        * This is called in the context of one normal IO request
+        * that has failed. Thus any sync request that might be pending
+        * will be blocked by nr_pending, and we need to wait for
+        * pending IO requests to complete or be queued for re-try.
+-       * Thus the number queued (nr_queued) plus this request (1)
++       * Thus the number queued (nr_queued) plus this request (extra)
+        * must match the number of pending IOs (nr_pending) before
+        * we continue.
+        */
+@@ -1083,7 +1083,7 @@ static void freeze_array(struct r10conf
+       conf->barrier++;
+       conf->nr_waiting++;
+       wait_event_lock_irq_cmd(conf->wait_barrier,
+-                              conf->nr_pending == conf->nr_queued+1,
++                              conf->nr_pending == conf->nr_queued+extra,
+                               conf->resync_lock,
+                               flush_pending_writes(conf));
+@@ -1849,8 +1849,8 @@ static int raid10_add_disk(struct mddev
+                * we wait for all outstanding requests to complete.
+                */
+               synchronize_sched();
+-              raise_barrier(conf, 0);
+-              lower_barrier(conf);
++              freeze_array(conf, 0);
++              unfreeze_array(conf);
+               clear_bit(Unmerged, &rdev->flags);
+       }
+       md_integrity_add_rdev(rdev, mddev);
+@@ -2646,7 +2646,7 @@ static void handle_read_error(struct mdd
+       r10_bio->devs[slot].bio = NULL;
+       if (mddev->ro == 0) {
+-              freeze_array(conf);
++              freeze_array(conf, 1);
+               fix_read_error(conf, mddev, r10_bio);
+               unfreeze_array(conf);
+       } else
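
For context, the calling convention after this patch: code running in the
management thread quiesces the array with freeze_array()/unfreeze_array()
rather than raise_barrier()/lower_barrier(), and tells freeze_array() how
many of its own requests are still pending.  An illustrative sketch only
(hypothetical function, written as if inside drivers/md/raid1.c):

    /* No request of ours is outstanding, so pass extra = 0; a caller
     * holding one failed request (like handle_read_error) passes 1. */
    static void example_reconfigure(struct r1conf *conf)
    {
            freeze_array(conf, 0);
            /* ... change the array configuration while I/O is frozen ... */
            unfreeze_array(conf);
    }
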
diff --git a/queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch b/queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch
new file mode 100644 (file)
index 0000000..8ab4b2a
--- /dev/null
@@ -0,0 +1,109 @@
+From 30dad30922ccc733cfdbfe232090cf674dc374dc Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Wed, 12 Jun 2013 14:05:04 -0700
+Subject: mm: migration: add migrate_entry_wait_huge()
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit 30dad30922ccc733cfdbfe232090cf674dc374dc upstream.
+
+When we have a page fault for an address which is backed by a hugepage
+under migration, the kernel can't wait correctly and busy-loops on the
+hugepage fault until the migration finishes.  As a result, users who try
+to kick hugepage migration (via soft offlining, for example) occasionally
+experience a long delay or a soft lockup.
+
+This is because pte_offset_map_lock() can't get a correct migration entry
+or a correct page table lock for a hugepage.  This patch introduces
+migration_entry_wait_huge() to solve this.
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/swapops.h |    3 +++
+ mm/hugetlb.c            |    2 +-
+ mm/migrate.c            |   23 ++++++++++++++++++-----
+ 3 files changed, 22 insertions(+), 6 deletions(-)
+
+--- a/include/linux/swapops.h
++++ b/include/linux/swapops.h
+@@ -137,6 +137,7 @@ static inline void make_migration_entry_
+ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                                       unsigned long address);
++extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte);
+ #else
+ #define make_migration_entry(page, write) swp_entry(0, 0)
+@@ -148,6 +149,8 @@ static inline int is_migration_entry(swp
+ static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                                        unsigned long address) { }
++static inline void migration_entry_wait_huge(struct mm_struct *mm,
++                                      pte_t *pte) { }
+ static inline int is_write_migration_entry(swp_entry_t entry)
+ {
+       return 0;
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -2823,7 +2823,7 @@ int hugetlb_fault(struct mm_struct *mm,
+       if (ptep) {
+               entry = huge_ptep_get(ptep);
+               if (unlikely(is_hugetlb_entry_migration(entry))) {
+-                      migration_entry_wait(mm, (pmd_t *)ptep, address);
++                      migration_entry_wait_huge(mm, ptep);
+                       return 0;
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON_LARGE |
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -200,15 +200,14 @@ static void remove_migration_ptes(struct
+  * get to the page and wait until migration is finished.
+  * When we return from this function the fault will be retried.
+  */
+-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+-                              unsigned long address)
++static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
++                              spinlock_t *ptl)
+ {
+-      pte_t *ptep, pte;
+-      spinlock_t *ptl;
++      pte_t pte;
+       swp_entry_t entry;
+       struct page *page;
+-      ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
++      spin_lock(ptl);
+       pte = *ptep;
+       if (!is_swap_pte(pte))
+               goto out;
+@@ -236,6 +235,20 @@ out:
+       pte_unmap_unlock(ptep, ptl);
+ }
++void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
++                              unsigned long address)
++{
++      spinlock_t *ptl = pte_lockptr(mm, pmd);
++      pte_t *ptep = pte_offset_map(pmd, address);
++      __migration_entry_wait(mm, ptep, ptl);
++}
++
++void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
++{
++      spinlock_t *ptl = &(mm)->page_table_lock;
++      __migration_entry_wait(mm, pte, ptl);
++}
++
+ #ifdef CONFIG_BLOCK
+ /* Returns true if all buffers are successfully locked */
+ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
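
The "soft offlining" mentioned in the changelog is one easy way to start a
hugepage migration and hit the path this patch fixes.  A user-space sketch,
assuming CONFIG_MEMORY_FAILURE, root privileges, and the usual sysfs
location (all assumptions to verify on the target kernel):

    #include <stdio.h>

    /* Ask the kernel to soft-offline (and therefore migrate) the page
     * containing the given physical address. */
    static int soft_offline(unsigned long long phys_addr)
    {
            FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

            if (!f)
                    return -1;
            fprintf(f, "0x%llx\n", phys_addr);
            return fclose(f);
    }
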
diff --git a/queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch b/queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch
new file mode 100644 (file)
index 0000000..cff7815
--- /dev/null
@@ -0,0 +1,95 @@
+From 026b08147923142e925a7d0aaa39038055ae0156 Mon Sep 17 00:00:00 2001
+From: Tomasz Stanislawski <t.stanislaws@samsung.com>
+Date: Wed, 12 Jun 2013 14:05:02 -0700
+Subject: mm/page_alloc.c: fix watermark check in __zone_watermark_ok()
+
+From: Tomasz Stanislawski <t.stanislaws@samsung.com>
+
+commit 026b08147923142e925a7d0aaa39038055ae0156 upstream.
+
+The watermark check consists of two sub-checks.  The first one is:
+
+       if (free_pages <= min + lowmem_reserve)
+               return false;
+
+The check ensures that there is a minimal amount of RAM in the zone.  If
+CMA is used then free_pages is reduced by the number of free pages
+in CMA prior to the above-mentioned check.
+
+       if (!(alloc_flags & ALLOC_CMA))
+               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+
+This prevents the zone from being drained of pages available for
+non-movable allocations.
+
+The second check prevents the zone from getting too fragmented.
+
+       for (o = 0; o < order; o++) {
+               free_pages -= z->free_area[o].nr_free << o;
+               min >>= 1;
+               if (free_pages <= min)
+                       return false;
+       }
+
+The field z->free_area[o].nr_free is equal to the number of free pages
+including free CMA pages.  Therefore the CMA pages are subtracted twice.
+This may cause a false-positive failure of __zone_watermark_ok() if the CMA
+area gets strongly fragmented.  In such a case there are many 0-order
+free pages located in CMA.  Those pages are subtracted twice, and therefore
+they will quickly drain free_pages during the check against
+fragmentation.  The test fails even though there are many free non-CMA
+pages in the zone.
+
+This patch fixes the issue by subtracting CMA pages only for the purpose of
+the (free_pages <= min + lowmem_reserve) check.
+
+Laura said:
+
+  We were observing allocation failures of higher order pages (order 5 =
+  128K typically) under tight memory conditions resulting in driver
+  failure.  The output from the page allocation failure showed plenty of
+  free pages of the appropriate order/type/zone and mostly CMA pages in
+  the lower orders.
+
+  For full disclosure, we still observed some page allocation failures
+  even after applying the patch but the number was drastically reduced and
+  those failures were attributed to fragmentation/other system issues.
+
+Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
+Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
+Tested-by: Laura Abbott <lauraa@codeaurora.org>
+Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Cc: Mel Gorman <mel@csn.ul.ie>
+Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1626,6 +1626,7 @@ static bool __zone_watermark_ok(struct z
+       long min = mark;
+       long lowmem_reserve = z->lowmem_reserve[classzone_idx];
+       int o;
++      long free_cma = 0;
+       free_pages -= (1 << order) - 1;
+       if (alloc_flags & ALLOC_HIGH)
+@@ -1635,9 +1636,10 @@ static bool __zone_watermark_ok(struct z
+ #ifdef CONFIG_CMA
+       /* If allocation can't use CMA areas don't use free CMA pages */
+       if (!(alloc_flags & ALLOC_CMA))
+-              free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
++              free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ #endif
+-      if (free_pages <= min + lowmem_reserve)
++
++      if (free_pages - free_cma <= min + lowmem_reserve)
+               return false;
+       for (o = 0; o < order; o++) {
+               /* At the next order, this order's pages become unavailable */
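
To see the double subtraction concretely: with 900 free pages of which 600
are free order-0 CMA pages, min = 200 and lowmem_reserve = 0, a non-CMA
higher-order allocation used to compute 900 - 600 = 300 for the first check
and then subtract the same 600 pages again inside the per-order loop,
failing even though plenty of non-CMA memory is free.  A toy model of the
fixed logic (made-up numbers and names, not kernel code):

    #include <stdbool.h>

    /* CMA pages are excluded only from the "enough free RAM" sub-check;
     * the fragmentation loop (omitted here) keeps using the unreduced
     * free_pages, so CMA pages are no longer subtracted twice. */
    static bool toy_watermark_ok(long free_pages, long free_cma,
                                 long min, long lowmem_reserve)
    {
            return free_pages - free_cma > min + lowmem_reserve;
    }
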
diff --git a/queue-3.9/series b/queue-3.9/series
index ad17f78ab5f4c6a327b12d434d06e1640a06c9f1..0c8158b215fd173c177cebaf056b15adca463657 100644 (file)
@@ -22,3 +22,10 @@ memcg-don-t-initialize-kmem-cache-destroying-work-for-root-caches.patch
 wl12xx-fix-minimum-required-firmware-version-for-wl127x-multirole.patch
 drm-i915-prefer-vbt-modes-for-svdo-lvds-over-edid.patch
 swap-avoid-read_swap_cache_async-race-to-deadlock-while-waiting-on-discard-i-o-completion.patch
+md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
+md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
+md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
+mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch
+mm-migration-add-migrate_entry_wait_huge.patch
+x86-fix-adjust_range_size_mask-calling-position.patch
+x86-fix-typo-in-kexec-register-clearing.patch
diff --git a/queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch b/queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch
new file mode 100644 (file)
index 0000000..ccbfc63
--- /dev/null
@@ -0,0 +1,91 @@
+From 7de3d66b1387ddf5a37d9689e5eb8510fb75c765 Mon Sep 17 00:00:00 2001
+From: Yinghai Lu <yinghai@kernel.org>
+Date: Fri, 31 May 2013 08:53:07 -0700
+Subject: x86: Fix adjust_range_size_mask calling position
+
+From: Yinghai Lu <yinghai@kernel.org>
+
+commit 7de3d66b1387ddf5a37d9689e5eb8510fb75c765 upstream.
+
+Commit
+
+    8d57470d x86, mm: setup page table in top-down
+
+causes a kernel panic while setting mem=2G.
+
+     [mem 0x00000000-0x000fffff] page 4k
+     [mem 0x7fe00000-0x7fffffff] page 1G
+     [mem 0x7c000000-0x7fdfffff] page 1G
+     [mem 0x00100000-0x001fffff] page 4k
+     [mem 0x00200000-0x7bffffff] page 2M
+
+The last entry above is not what we want; we should have
+     [mem 0x00200000-0x3fffffff] page 2M
+     [mem 0x40000000-0x7bffffff] page 1G
+
+Actually we merge continuous ranges with the same page size too early.
+In this case, before merging we have
+     [mem 0x00200000-0x3fffffff] page 2M
+     [mem 0x40000000-0x7bffffff] page 2M
+After merging them, we get
+     [mem 0x00200000-0x7bffffff] page 2M
+even though we could use a 1G page to map
+     [mem 0x40000000-0x7bffffff]
+
+That will cause a problem, because we already map
+     [mem 0x7fe00000-0x7fffffff] page 1G
+     [mem 0x7c000000-0x7fdfffff] page 1G
+with 1G pages, i.e. [0x40000000-0x7fffffff] is already mapped with 1G pages.
+During phys_pud_init() for [0x40000000-0x7bffffff], it will not
+reuse the existing pud page; it allocates a new one and then tries to
+map it with 2M pages instead, as page_size_mask does not include
+PG_LEVEL_1G.  In the end [0x7c000000-0x7fffffff] is left unmapped, since
+the loop in phys_pmd_init() stops mapping at 0x7bffffff.
+
+That is the right behavior: it maps exactly the range, with exactly the
+page size, that we ask for, and we should explicitly call it to map
+[0x7c000000-0x7fffffff] before or after mapping 0x40000000-0x7bffffff.
+Anyway, we need to make sure each range's page_size_mask is correct and
+consistent after split_mem_range().
+
+Fix that by calling adjust_range_size_mask before merging ranges
+with the same page size.
+
+-v2: update change log.
+-v3: add more explanation why [7c000000-0x7fffffff] is not mapped, and
+    it causes panic.
+
+Bisected-by: "Xie, ChanglongX" <changlongx.xie@intel.com>
+Bisected-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+Reported-and-tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+Signed-off-by: Yinghai Lu <yinghai@kernel.org>
+Link: http://lkml.kernel.org/r/1370015587-20835-1-git-send-email-yinghai@kernel.org
+Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/init.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -277,6 +277,9 @@ static int __meminit split_mem_range(str
+       end_pfn = limit_pfn;
+       nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
++      if (!after_bootmem)
++              adjust_range_page_size_mask(mr, nr_range);
++
+       /* try to merge same page size and continuous */
+       for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+               unsigned long old_start;
+@@ -291,9 +294,6 @@ static int __meminit split_mem_range(str
+               nr_range--;
+       }
+-      if (!after_bootmem)
+-              adjust_range_page_size_mask(mr, nr_range);
+-
+       for (i = 0; i < nr_range; i++)
+               printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+                               mr[i].start, mr[i].end - 1,
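
The ordering the patch restores can be read as: first fix up each range's
page_size_mask, then merge neighbours, because the merge step keys off
"same page size".  A schematic sketch only -- merge_adjacent_same_size()
is a made-up stand-in for the merge loop inside split_mem_range():

    /* Fix up masks first; merging first can fuse a range that could use
     * 1G pages into a 2M-only neighbour (the bug described above). */
    static void example_split_then_merge(struct map_range *mr, int *nr_range)
    {
            adjust_range_page_size_mask(mr, *nr_range); /* 1: fix masks  */
            merge_adjacent_same_size(mr, nr_range);     /* 2: now merge  */
    }
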
diff --git a/queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch b/queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch
new file mode 100644 (file)
index 0000000..e732404
--- /dev/null
@@ -0,0 +1,33 @@
+From c8a22d19dd238ede87aa0ac4f7dbea8da039b9c1 Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Wed, 5 Jun 2013 11:47:18 -0700
+Subject: x86: Fix typo in kexec register clearing
+
+From: Kees Cook <keescook@chromium.org>
+
+commit c8a22d19dd238ede87aa0ac4f7dbea8da039b9c1 upstream.
+
+Fixes a typo in register clearing code. Thanks to PaX Team for fixing
+this originally, and James Troup for pointing it out.
+
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Link: http://lkml.kernel.org/r/20130605184718.GA8396@www.outflux.net
+Cc: PaX Team <pageexec@freemail.hu>
+Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/relocate_kernel_64.S |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/relocate_kernel_64.S
++++ b/arch/x86/kernel/relocate_kernel_64.S
+@@ -160,7 +160,7 @@ identity_mapped:
+       xorq    %rbp, %rbp
+       xorq    %r8,  %r8
+       xorq    %r9,  %r9
+-      xorq    %r10, %r9
++      xorq    %r10, %r10
+       xorq    %r11, %r11
+       xorq    %r12, %r12
+       xorq    %r13, %r13