From: Greg Kroah-Hartman
Date: Fri, 14 Jun 2013 21:58:03 +0000 (-0700)
Subject: 3.9-stable patches
X-Git-Tag: v3.0.83~8
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=46c5d174655d1fcff8f6795fa6f2709b2879d5be;p=thirdparty%2Fkernel%2Fstable-queue.git

3.9-stable patches

added patches:
	md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
	md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
	md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
	mm-migration-add-migrate_entry_wait_huge.patch
	mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch
	x86-fix-adjust_range_size_mask-calling-position.patch
	x86-fix-typo-in-kexec-register-clearing.patch
---

diff --git a/queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch b/queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
new file mode 100644
index 00000000000..fd49e821d0e
--- /dev/null
+++ b/queue-3.9/md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
@@ -0,0 +1,89 @@
+From 5026d7a9b2f3eb1f9bda66c18ac6bc3036ec9020 Mon Sep 17 00:00:00 2001
+From: "H. Peter Anvin"
+Date: Wed, 12 Jun 2013 07:37:43 -0700
+Subject: md/raid1,5,10: Disable WRITE SAME until a recovery strategy is in place
+
+From: "H. Peter Anvin"
+
+commit 5026d7a9b2f3eb1f9bda66c18ac6bc3036ec9020 upstream.
+
+There are cases where the kernel will believe that the WRITE SAME
+command is supported by a block device which does not, in fact,
+support WRITE SAME.  This currently happens for SATA drives behind a
+SAS controller, but there are probably a hundred other ways that can
+happen, including drive firmware bugs.
+
+After receiving an error for WRITE SAME, the block layer will retry the
+request as a plain write of zeroes, but mdraid will treat the
+failure as fatal and consider the drive failed.  This has the effect
+that all the mirrors containing a specific set of data are each
+offlined in very rapid succession, resulting in data loss.
+
+However, just bouncing the request back up to the block layer isn't
+ideal either, because the whole initial request-retry sequence should
+be inside the write bitmap fence, which probably means that md needs
+to do its own conversion of WRITE SAME to write zero.
+
+Until the failure scenario has been sorted out, disable WRITE SAME for
+raid1, raid5, and raid10.
+
+[neilb: added raid5]
+
+This patch is appropriate for any -stable since 3.7 when write_same
+support was added.
+
+Signed-off-by: H. Peter Anvin
+Signed-off-by: NeilBrown
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid1.c  |    4 ++--
+ drivers/md/raid10.c |    3 +--
+ drivers/md/raid5.c  |    4 +++-
+ 3 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -2837,8 +2837,8 @@ static int run(struct mddev *mddev)
+ 	return PTR_ERR(conf);
+
+ 	if (mddev->queue)
+-		blk_queue_max_write_same_sectors(mddev->queue,
+-						 mddev->chunk_sectors);
++		blk_queue_max_write_same_sectors(mddev->queue, 0);
++
+ 	rdev_for_each(rdev, mddev) {
+ 		if (!mddev->gendisk)
+ 			continue;
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -3635,8 +3635,7 @@ static int run(struct mddev *mddev)
+ 	if (mddev->queue) {
+ 		blk_queue_max_discard_sectors(mddev->queue,
+ 					      mddev->chunk_sectors);
+-		blk_queue_max_write_same_sectors(mddev->queue,
+-						 mddev->chunk_sectors);
++		blk_queue_max_write_same_sectors(mddev->queue, 0);
+ 		blk_queue_io_min(mddev->queue, chunk_size);
+ 		if (conf->geo.raid_disks % conf->geo.near_copies)
+ 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -5457,7 +5457,7 @@ static int run(struct mddev *mddev)
+ 		if (mddev->major_version == 0 &&
+ 		    mddev->minor_version > 90)
+ 			rdev->recovery_offset = reshape_offset;
+-
++
+ 		if (rdev->recovery_offset < reshape_offset) {
+ 			/* We need to check old and new layout */
+ 			if (!only_parity(rdev->raid_disk,
+@@ -5580,6 +5580,8 @@ static int run(struct mddev *mddev)
+ 	 */
+ 	mddev->queue->limits.discard_zeroes_data = 0;
+
++	blk_queue_max_write_same_sectors(mddev->queue, 0);
++
+ 	rdev_for_each(rdev, mddev) {
+ 		disk_stack_limits(mddev->gendisk, rdev->bdev,
+ 				  rdev->data_offset << 9);
diff --git a/queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch b/queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
new file mode 100644
index 00000000000..7d322d931f6
--- /dev/null
+++ b/queue-3.9/md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
@@ -0,0 +1,88 @@
+From 3056e3aec8d8ba61a0710fb78b2d562600aa2ea7 Mon Sep 17 00:00:00 2001
+From: Alex Lyakas
+Date: Tue, 4 Jun 2013 20:42:21 +0300
+Subject: md/raid1: consider WRITE as successful only if at least one non-Faulty and non-rebuilding drive completed it.
+
+From: Alex Lyakas
+
+commit 3056e3aec8d8ba61a0710fb78b2d562600aa2ea7 upstream.
+
+Without this fix, the following scenario could happen:
+
+- RAID1 with drives A and B; drive B was freshly-added and is rebuilding
+- Drive A fails
+- WRITE request arrives at the array.  It is failed by drive A, so
+r1_bio is marked as R1BIO_WriteError, but the rebuilding drive B
+succeeds in writing it, so the same r1_bio is marked as
+R1BIO_Uptodate.
+- r1_bio arrives at handle_write_finished, badblocks are disabled,
+md_error()->error() does nothing because we don't fail the last drive
+of raid1
+- raid_end_bio_io() calls call_bio_endio()
+- As a result, in call_bio_endio():
+	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+this code doesn't clear the BIO_UPTODATE flag, and the whole master
+WRITE succeeds, back to the upper layer.
+
+So we returned success to the upper layer, even though we had written
+the data onto the rebuilding drive only.  But when we want to read the
+data back, we would not read from the rebuilding drive, so this data
+is lost.
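+
+To make the new success criterion concrete, here is a minimal sketch
+(the struct and function names are invented stand-ins, not the kernel
+types; the real test uses the rdev flags shown in the diff below):
+
+	struct mirror_state {
+		int in_sync;	/* drive holds a complete, current copy */
+		int faulty;	/* drive has been marked failed by md */
+	};
+
+	/* Count a completed write toward array-level success only if it
+	 * landed on a drive the data could later be read back from. */
+	static int write_counts_as_uptodate(const struct mirror_state *m)
+	{
+		return m->in_sync && !m->faulty;
+	}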
+
+[neilb - applied identical change to raid10 as well]
+
+This bug can result in lost data, so it is suitable for any
+-stable kernel.
+
+Signed-off-by: Alex Lyakas
+Signed-off-by: NeilBrown
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid1.c  |   12 +++++++++++-
+ drivers/md/raid10.c |   12 +++++++++++-
+ 2 files changed, 22 insertions(+), 2 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -427,7 +427,17 @@ static void raid1_end_write_request(stru
+
+ 		r1_bio->bios[mirror] = NULL;
+ 		to_put = bio;
+-		set_bit(R1BIO_Uptodate, &r1_bio->state);
++		/*
++		 * Do not set R1BIO_Uptodate if the current device is
++		 * rebuilding or Faulty. This is because we cannot use
++		 * such device for properly reading the data back (we could
++		 * potentially use it, if the current write would have felt
++		 * before rdev->recovery_offset, but for simplicity we don't
++		 * check this here.
++		 */
++		if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
++		    !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
++			set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ 		/* Maybe we can clear some bad blocks. */
+ 		if (is_badblock(conf->mirrors[mirror].rdev,
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -490,7 +490,17 @@ static void raid10_end_write_request(str
+ 			sector_t first_bad;
+ 			int bad_sectors;
+
+-			set_bit(R10BIO_Uptodate, &r10_bio->state);
++			/*
++			 * Do not set R10BIO_Uptodate if the current device is
++			 * rebuilding or Faulty. This is because we cannot use
++			 * such device for properly reading the data back (we could
++			 * potentially use it, if the current write would have felt
++			 * before rdev->recovery_offset, but for simplicity we don't
++			 * check this here.
++			 */
++			if (test_bit(In_sync, &rdev->flags) &&
++			    !test_bit(Faulty, &rdev->flags))
++				set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+ 		/* Maybe we can clear some bad blocks. */
+ 		if (is_badblock(rdev,
diff --git a/queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch b/queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
new file mode 100644
index 00000000000..e91d5231265
--- /dev/null
+++ b/queue-3.9/md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
@@ -0,0 +1,176 @@
+From e2d59925221cd562e07fee38ec8839f7209ae603 Mon Sep 17 00:00:00 2001
+From: NeilBrown
+Date: Wed, 12 Jun 2013 11:01:22 +1000
+Subject: md/raid1,raid10: use freeze_array in place of raise_barrier in various places.
+
+From: NeilBrown
+
+commit e2d59925221cd562e07fee38ec8839f7209ae603 upstream.
+
+Various places in raid1 and raid10 are calling raise_barrier when they
+really should call freeze_array.
+The former is only intended to be called from "make_request".
+The latter has extra checks for 'nr_queued' and makes a call to
+flush_pending_writes(), so it is safe to call it from within the
+management thread.
+
+Using raise_barrier will sometimes deadlock.  Using freeze_array
+should not.
+
+As 'freeze_array' currently expects one request to be pending (in
+handle_read_error - the only previous caller), we need to pass
+it the number of pending requests (extra) to ignore.
+
+The deadlock was made particularly noticeable by commits
+050b66152f87c7 (raid10) and 6b740b8d79252f13 (raid1) which
+appeared in 3.4, so the fix is appropriate for any -stable
+kernel since then.
+
+This patch probably won't apply directly to some early kernels and
+will need to be applied by hand.
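+
+The changed wait condition, reduced to its arithmetic (an illustrative
+sketch with plain integers rather than the conf structure; the helper
+name is invented):
+
+	/* The array is quiescent once every pending request is accounted
+	 * for: either completed or parked on the retry queue.  'extra' is
+	 * the number of requests the caller itself still holds open --
+	 * 1 when called from handle_read_error, 0 everywhere else. */
+	static int array_frozen(int nr_pending, int nr_queued, int extra)
+	{
+		return nr_pending == nr_queued + extra;
+	}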
+ +Reported-by: Alexander Lyakas +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid1.c | 22 +++++++++++----------- + drivers/md/raid10.c | 14 +++++++------- + 2 files changed, 18 insertions(+), 18 deletions(-) + +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -890,17 +890,17 @@ static void allow_barrier(struct r1conf + wake_up(&conf->wait_barrier); + } + +-static void freeze_array(struct r1conf *conf) ++static void freeze_array(struct r1conf *conf, int extra) + { + /* stop syncio and normal IO and wait for everything to + * go quite. + * We increment barrier and nr_waiting, and then +- * wait until nr_pending match nr_queued+1 ++ * wait until nr_pending match nr_queued+extra + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. +- * Thus the number queued (nr_queued) plus this request (1) ++ * Thus the number queued (nr_queued) plus this request (extra) + * must match the number of pending IOs (nr_pending) before + * we continue. + */ +@@ -908,7 +908,7 @@ static void freeze_array(struct r1conf * + conf->barrier++; + conf->nr_waiting++; + wait_event_lock_irq_cmd(conf->wait_barrier, +- conf->nr_pending == conf->nr_queued+1, ++ conf->nr_pending == conf->nr_queued+extra, + conf->resync_lock, + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); +@@ -1568,8 +1568,8 @@ static int raid1_add_disk(struct mddev * + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); +- raise_barrier(conf); +- lower_barrier(conf); ++ freeze_array(conf, 0); ++ unfreeze_array(conf); + clear_bit(Unmerged, &rdev->flags); + } + md_integrity_add_rdev(rdev, mddev); +@@ -1619,11 +1619,11 @@ static int raid1_remove_disk(struct mdde + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; +- raise_barrier(conf); ++ freeze_array(conf, 0); + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; +- lower_barrier(conf); ++ unfreeze_array(conf); + clear_bit(WantReplacement, &rdev->flags); + } else + clear_bit(WantReplacement, &rdev->flags); +@@ -2240,7 +2240,7 @@ static void handle_read_error(struct r1c + * frozen + */ + if (mddev->ro == 0) { +- freeze_array(conf); ++ freeze_array(conf, 1); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, r1_bio->sectors); + unfreeze_array(conf); +@@ -3019,7 +3019,7 @@ static int raid1_reshape(struct mddev *m + return -ENOMEM; + } + +- raise_barrier(conf); ++ freeze_array(conf, 0); + + /* ok, everything is stopped */ + oldpool = conf->r1bio_pool; +@@ -3050,7 +3050,7 @@ static int raid1_reshape(struct mddev *m + conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; + +- lower_barrier(conf); ++ unfreeze_array(conf); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1065,17 +1065,17 @@ static void allow_barrier(struct r10conf + wake_up(&conf->wait_barrier); + } + +-static void freeze_array(struct r10conf *conf) ++static void freeze_array(struct r10conf *conf, int extra) + { + /* stop syncio and normal IO and wait for everything to + * go quiet. 
+ 	 * We increment barrier and nr_waiting, and then
+-	 * wait until nr_pending match nr_queued+1
++	 * wait until nr_pending match nr_queued+extra
+ 	 * This is called in the context of one normal IO request
+ 	 * that has failed. Thus any sync request that might be pending
+ 	 * will be blocked by nr_pending, and we need to wait for
+ 	 * pending IO requests to complete or be queued for re-try.
+-	 * Thus the number queued (nr_queued) plus this request (1)
++	 * Thus the number queued (nr_queued) plus this request (extra)
+ 	 * must match the number of pending IOs (nr_pending) before
+ 	 * we continue.
+ 	 */
+@@ -1083,7 +1083,7 @@ static void freeze_array(struct r10conf
+ 	conf->barrier++;
+ 	conf->nr_waiting++;
+ 	wait_event_lock_irq_cmd(conf->wait_barrier,
+-				conf->nr_pending == conf->nr_queued+1,
++				conf->nr_pending == conf->nr_queued+extra,
+ 				conf->resync_lock,
+ 				flush_pending_writes(conf));
+
+@@ -1849,8 +1849,8 @@ static int raid10_add_disk(struct mddev
+ 		 * we wait for all outstanding requests to complete.
+ 		 */
+ 		synchronize_sched();
+-		raise_barrier(conf, 0);
+-		lower_barrier(conf);
++		freeze_array(conf, 0);
++		unfreeze_array(conf);
+ 		clear_bit(Unmerged, &rdev->flags);
+ 	}
+ 	md_integrity_add_rdev(rdev, mddev);
+@@ -2646,7 +2646,7 @@ static void handle_read_error(struct mdd
+ 	r10_bio->devs[slot].bio = NULL;
+
+ 	if (mddev->ro == 0) {
+-		freeze_array(conf);
++		freeze_array(conf, 1);
+ 		fix_read_error(conf, mddev, r10_bio);
+ 		unfreeze_array(conf);
+ 	} else
diff --git a/queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch b/queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch
new file mode 100644
index 00000000000..8ab4b2ac6e2
--- /dev/null
+++ b/queue-3.9/mm-migration-add-migrate_entry_wait_huge.patch
@@ -0,0 +1,109 @@
+From 30dad30922ccc733cfdbfe232090cf674dc374dc Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Wed, 12 Jun 2013 14:05:04 -0700
+Subject: mm: migration: add migrate_entry_wait_huge()
+
+From: Naoya Horiguchi
+
+commit 30dad30922ccc733cfdbfe232090cf674dc374dc upstream.
+
+When we get a page fault for an address which is backed by a hugepage
+under migration, the kernel can't wait correctly and busy-loops on the
+hugepage fault until the migration finishes.  As a result, users who
+try to kick off hugepage migration (via soft offlining, for example)
+occasionally experience long delays or soft lockups.
+
+This is because pte_offset_map_lock() can't get a correct migration
+entry or a correct page table lock for a hugepage.  This patch
+introduces migration_entry_wait_huge() to solve this.
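+
+Distilled, the fix splits the waiter from the lock lookup so each page
+size supplies the lock that actually guards its page table entry.  A
+sketch of the decision only (entry_lock is an invented name; the real
+helpers are migration_entry_wait(), migration_entry_wait_huge() and
+__migration_entry_wait() in the diff below):
+
+	/* Pick the lock guarding the entry a fault handler must wait on. */
+	static spinlock_t *entry_lock(struct mm_struct *mm, pmd_t *pmd,
+				      bool is_huge)
+	{
+		/* hugepage ptes are guarded by the per-mm page_table_lock;
+		 * normal ptes by the pte lock derived from their pmd */
+		return is_huge ? &mm->page_table_lock : pte_lockptr(mm, pmd);
+	}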
+ +Signed-off-by: Naoya Horiguchi +Reviewed-by: Rik van Riel +Reviewed-by: Wanpeng Li +Reviewed-by: Michal Hocko +Cc: Mel Gorman +Cc: Andi Kleen +Cc: KOSAKI Motohiro +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swapops.h | 3 +++ + mm/hugetlb.c | 2 +- + mm/migrate.c | 23 ++++++++++++++++++----- + 3 files changed, 22 insertions(+), 6 deletions(-) + +--- a/include/linux/swapops.h ++++ b/include/linux/swapops.h +@@ -137,6 +137,7 @@ static inline void make_migration_entry_ + + extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); ++extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); + #else + + #define make_migration_entry(page, write) swp_entry(0, 0) +@@ -148,6 +149,8 @@ static inline int is_migration_entry(swp + static inline void make_migration_entry_read(swp_entry_t *entryp) { } + static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { } ++static inline void migration_entry_wait_huge(struct mm_struct *mm, ++ pte_t *pte) { } + static inline int is_write_migration_entry(swp_entry_t entry) + { + return 0; +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2823,7 +2823,7 @@ int hugetlb_fault(struct mm_struct *mm, + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_migration(entry))) { +- migration_entry_wait(mm, (pmd_t *)ptep, address); ++ migration_entry_wait_huge(mm, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -200,15 +200,14 @@ static void remove_migration_ptes(struct + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + */ +-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, +- unsigned long address) ++static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, ++ spinlock_t *ptl) + { +- pte_t *ptep, pte; +- spinlock_t *ptl; ++ pte_t pte; + swp_entry_t entry; + struct page *page; + +- ptep = pte_offset_map_lock(mm, pmd, address, &ptl); ++ spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; +@@ -236,6 +235,20 @@ out: + pte_unmap_unlock(ptep, ptl); + } + ++void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, ++ unsigned long address) ++{ ++ spinlock_t *ptl = pte_lockptr(mm, pmd); ++ pte_t *ptep = pte_offset_map(pmd, address); ++ __migration_entry_wait(mm, ptep, ptl); ++} ++ ++void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) ++{ ++ spinlock_t *ptl = &(mm)->page_table_lock; ++ __migration_entry_wait(mm, pte, ptl); ++} ++ + #ifdef CONFIG_BLOCK + /* Returns true if all buffers are successfully locked */ + static bool buffer_migrate_lock_buffers(struct buffer_head *head, diff --git a/queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch b/queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch new file mode 100644 index 00000000000..cff78155885 --- /dev/null +++ b/queue-3.9/mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch @@ -0,0 +1,95 @@ +From 026b08147923142e925a7d0aaa39038055ae0156 Mon Sep 17 00:00:00 2001 +From: Tomasz Stanislawski +Date: Wed, 12 Jun 2013 14:05:02 -0700 +Subject: mm/page_alloc.c: fix watermark check in __zone_watermark_ok() + +From: Tomasz Stanislawski + +commit 026b08147923142e925a7d0aaa39038055ae0156 upstream. + +The watermark check consists of two sub-checks. 
The first one is:
+
+	if (free_pages <= min + lowmem_reserve)
+		return false;
+
+This check ensures that the zone contains a minimal amount of free RAM.
+If CMA is used, free_pages is first reduced by the number of free CMA
+pages, prior to the check above:
+
+	if (!(alloc_flags & ALLOC_CMA))
+		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+
+This prevents the zone from being drained of pages available for
+non-movable allocations.
+
+The second check prevents the zone from getting too fragmented:
+
+	for (o = 0; o < order; o++) {
+		free_pages -= z->free_area[o].nr_free << o;
+		min >>= 1;
+		if (free_pages <= min)
+			return false;
+	}
+
+The field z->free_area[o].nr_free is equal to the number of free pages,
+including free CMA pages.  Therefore the CMA pages are subtracted twice.
+This may cause a false-positive failure of __zone_watermark_ok() if the
+CMA area gets strongly fragmented.  In such a case there are many
+0-order free pages located in CMA.  Because those pages are subtracted
+twice, they quickly drain free_pages during the fragmentation check,
+and the test fails even though there are many free non-CMA pages in
+the zone.
+
+This patch fixes the issue by subtracting the CMA pages only for the
+purpose of the (free_pages <= min + lowmem_reserve) check.
+
+Laura said:
+
+  We were observing allocation failures of higher order pages (order 5 =
+  128K typically) under tight memory conditions resulting in driver
+  failure.  The output from the page allocation failure showed plenty of
+  free pages of the appropriate order/type/zone and mostly CMA pages in
+  the lower orders.
+
+  For full disclosure, we still observed some page allocation failures
+  even after applying the patch but the number was drastically reduced and
+  those failures were attributed to fragmentation/other system issues.
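+
+A worked example (numbers invented for illustration; the small
+"free_pages -= (1 << order) - 1" adjustment is ignored) makes the
+double subtraction visible:
+
+	order-1 request, min = 100, lowmem_reserve = 0, no ALLOC_CMA
+	free_pages = 1000, free CMA pages = 600 (all of order 0)
+	z->free_area[0].nr_free = 800  (600 CMA + 200 regular)
+
+	before this patch:
+	  min check:  1000 - 600 = 400 > 100            -> passes
+	  o = 0 loop: 400 - 800 = -400 <= 50 (min >> 1) -> false failure,
+	  even though 400 regular pages are free, 200 of them in blocks
+	  of order >= 1
+
+	after this patch:
+	  min check:  1000 - 600 = 400 > 100            -> passes
+	  o = 0 loop: 1000 - 800 = 200 > 50             -> passes
+
+The 600 CMA pages were previously subtracted once via NR_FREE_CMA_PAGES
+and again via nr_free, which is what drained free_pages.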
+
+Signed-off-by: Tomasz Stanislawski
+Signed-off-by: Kyungmin Park
+Tested-by: Laura Abbott
+Cc: Bartlomiej Zolnierkiewicz
+Acked-by: Minchan Kim
+Cc: Mel Gorman
+Tested-by: Marek Szyprowski
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/page_alloc.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1626,6 +1626,7 @@ static bool __zone_watermark_ok(struct z
+ 	long min = mark;
+ 	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
+ 	int o;
++	long free_cma = 0;
+
+ 	free_pages -= (1 << order) - 1;
+ 	if (alloc_flags & ALLOC_HIGH)
+@@ -1635,9 +1636,10 @@ static bool __zone_watermark_ok(struct z
+ #ifdef CONFIG_CMA
+ 	/* If allocation can't use CMA areas don't use free CMA pages */
+ 	if (!(alloc_flags & ALLOC_CMA))
+-		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
++		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ #endif
+-	if (free_pages <= min + lowmem_reserve)
++
++	if (free_pages - free_cma <= min + lowmem_reserve)
+ 		return false;
+ 	for (o = 0; o < order; o++) {
+ 		/* At the next order, this order's pages become unavailable */
diff --git a/queue-3.9/series b/queue-3.9/series
index ad17f78ab5f..0c8158b215f 100644
--- a/queue-3.9/series
+++ b/queue-3.9/series
@@ -22,3 +22,10 @@ memcg-don-t-initialize-kmem-cache-destroying-work-for-root-caches.patch
 wl12xx-fix-minimum-required-firmware-version-for-wl127x-multirole.patch
 drm-i915-prefer-vbt-modes-for-svdo-lvds-over-edid.patch
 swap-avoid-read_swap_cache_async-race-to-deadlock-while-waiting-on-discard-i-o-completion.patch
+md-raid1-consider-write-as-successful-only-if-at-least-one-non-faulty-and-non-rebuilding-drive-completed-it.patch
+md-raid1-5-10-disable-write-same-until-a-recovery-strategy-is-in-place.patch
+md-raid1-raid10-use-freeze_array-in-place-of-raise_barrier-in-various-places.patch
+mm-page_alloc.c-fix-watermark-check-in-__zone_watermark_ok.patch
+mm-migration-add-migrate_entry_wait_huge.patch
+x86-fix-adjust_range_size_mask-calling-position.patch
+x86-fix-typo-in-kexec-register-clearing.patch
diff --git a/queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch b/queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch
new file mode 100644
index 00000000000..ccbfc634f9a
--- /dev/null
+++ b/queue-3.9/x86-fix-adjust_range_size_mask-calling-position.patch
@@ -0,0 +1,91 @@
+From 7de3d66b1387ddf5a37d9689e5eb8510fb75c765 Mon Sep 17 00:00:00 2001
+From: Yinghai Lu
+Date: Fri, 31 May 2013 08:53:07 -0700
+Subject: x86: Fix adjust_range_size_mask calling position
+
+From: Yinghai Lu
+
+commit 7de3d66b1387ddf5a37d9689e5eb8510fb75c765 upstream.
+
+Commit
+
+    8d57470d x86, mm: setup page table in top-down
+
+causes a kernel panic while setting mem=2G.
+
+    [mem 0x00000000-0x000fffff] page 4k
+    [mem 0x7fe00000-0x7fffffff] page 1G
+    [mem 0x7c000000-0x7fdfffff] page 1G
+    [mem 0x00100000-0x001fffff] page 4k
+    [mem 0x00200000-0x7bffffff] page 2M
+
+The last entry is not what we want; we should have
+
+    [mem 0x00200000-0x3fffffff] page 2M
+    [mem 0x40000000-0x7bffffff] page 1G
+
+Actually we merge the continuous ranges with the same page size too
+early.
+In this case, before merging we have
+
+    [mem 0x00200000-0x3fffffff] page 2M
+    [mem 0x40000000-0x7bffffff] page 2M
+
+and after merging them we get
+
+    [mem 0x00200000-0x7bffffff] page 2M
+
+even though we could use a 1G page to map
+
+    [mem 0x40000000-0x7bffffff]
+
+That causes a problem, because we have already mapped
+
+    [mem 0x7fe00000-0x7fffffff] page 1G
+    [mem 0x7c000000-0x7fdfffff] page 1G
+
+with 1G pages, i.e. [0x40000000-0x7fffffff] is already mapped with 1G
+pages.  During phys_pud_init() for [0x40000000-0x7bffffff], it will not
+reuse that existing pud page; it allocates a new one and then tries to
+map the range with 2M pages instead, as page_size_mask does not include
+PG_LEVEL_1G.  In the end [0x7c000000-0x7fffffff] is left unmapped: the
+loop in phys_pmd_init() stops mapping at 0x7bffffff.
+
+That is the right behavior: it maps exactly the range, with exactly the
+page size, that we ask for, so we would have to explicitly call it to
+map [0x7c000000-0x7fffffff] before or after mapping
+0x40000000-0x7bffffff.  Either way, we need to make sure each range's
+page_size_mask is correct and consistent after split_mem_range.
+
+Fix this by calling adjust_range_size_mask before merging ranges
+with the same page size.
+
+-v2: update change log.
+-v3: add more explanation of why [0x7c000000-0x7fffffff] is not mapped,
+     and why it causes a panic.
+
+Bisected-by: "Xie, ChanglongX"
+Bisected-by: Yuanhan Liu
+Reported-and-tested-by: Yuanhan Liu
+Signed-off-by: Yinghai Lu
+Link: http://lkml.kernel.org/r/1370015587-20835-1-git-send-email-yinghai@kernel.org
+Signed-off-by: H. Peter Anvin
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/mm/init.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -277,6 +277,9 @@ static int __meminit split_mem_range(str
+ 	end_pfn = limit_pfn;
+ 	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
++	if (!after_bootmem)
++		adjust_range_page_size_mask(mr, nr_range);
++
+ 	/* try to merge same page size and continuous */
+ 	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ 		unsigned long old_start;
+@@ -291,9 +294,6 @@ static int __meminit split_mem_range(str
+ 		nr_range--;
+ 	}
+
+-	if (!after_bootmem)
+-		adjust_range_page_size_mask(mr, nr_range);
+-
+ 	for (i = 0; i < nr_range; i++)
+ 		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+ 			mr[i].start, mr[i].end - 1,
diff --git a/queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch b/queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch
new file mode 100644
index 00000000000..e7324048358
--- /dev/null
+++ b/queue-3.9/x86-fix-typo-in-kexec-register-clearing.patch
@@ -0,0 +1,33 @@
+From c8a22d19dd238ede87aa0ac4f7dbea8da039b9c1 Mon Sep 17 00:00:00 2001
+From: Kees Cook
+Date: Wed, 5 Jun 2013 11:47:18 -0700
+Subject: x86: Fix typo in kexec register clearing
+
+From: Kees Cook
+
+commit c8a22d19dd238ede87aa0ac4f7dbea8da039b9c1 upstream.
+
+Fixes a typo in register clearing code.  Thanks to PaX Team for fixing
+this originally, and James Troup for pointing it out.
+
+Signed-off-by: Kees Cook
+Link: http://lkml.kernel.org/r/20130605184718.GA8396@www.outflux.net
+Cc: PaX Team
+Signed-off-by: H. Peter Anvin
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/relocate_kernel_64.S |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/relocate_kernel_64.S
++++ b/arch/x86/kernel/relocate_kernel_64.S
+@@ -160,7 +160,7 @@ identity_mapped:
+ 	xorq	%rbp, %rbp
+ 	xorq	%r8, %r8
+ 	xorq	%r9, %r9
+-	xorq	%r10, %r9
++	xorq	%r10, %r10
+ 	xorq	%r11, %r11
+ 	xorq	%r12, %r12
+ 	xorq	%r13, %r13