From 54e2e8c7ca943df2b7e4b2ce7be4c1e19c929430 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Fri, 8 Mar 2019 12:15:55 +0100
Subject: [PATCH] 3.18-stable patches

added patches:
	hugetlbfs-fix-races-and-page-leaks-during-migration.patch
	xtensa-fix-get_wchan.patch
---
 ...aces-and-page-leaks-during-migration.patch | 201 ++++++++++++++++++
 queue-3.18/series                             |   2 +
 queue-3.18/xtensa-fix-get_wchan.patch         |  35 +++
 3 files changed, 238 insertions(+)
 create mode 100644 queue-3.18/hugetlbfs-fix-races-and-page-leaks-during-migration.patch
 create mode 100644 queue-3.18/xtensa-fix-get_wchan.patch

diff --git a/queue-3.18/hugetlbfs-fix-races-and-page-leaks-during-migration.patch b/queue-3.18/hugetlbfs-fix-races-and-page-leaks-during-migration.patch
new file mode 100644
index 00000000000..cb8de0d44f9
--- /dev/null
+++ b/queue-3.18/hugetlbfs-fix-races-and-page-leaks-during-migration.patch
@@ -0,0 +1,201 @@
+From cb6acd01e2e43fd8bad11155752b7699c3d0fb76 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz
+Date: Thu, 28 Feb 2019 16:22:02 -0800
+Subject: hugetlbfs: fix races and page leaks during migration
+
+From: Mike Kravetz
+
+commit cb6acd01e2e43fd8bad11155752b7699c3d0fb76 upstream.
+
+hugetlb pages should only be migrated if they are 'active'. The
+routines set/clear_page_huge_active() modify the active state of hugetlb
+pages.
+
+When a new hugetlb page is allocated at fault time, set_page_huge_active
+is called before the page is locked. Therefore, another thread could
+race and migrate the page while it is being added to page table by the
+fault code. This race is somewhat hard to trigger, but can be seen by
+strategically adding udelay to simulate worst case scheduling behavior.
+Depending on 'how' the code races, various BUG()s could be triggered.
+
+To address this issue, simply delay the set_page_huge_active call until
+after the page is successfully added to the page table.
+
+Hugetlb pages can also be leaked at migration time if the pages are
+associated with a file in an explicitly mounted hugetlbfs filesystem.
+For example, consider a two node system with 4GB worth of huge pages
+available. A program mmaps a 2G file in a hugetlbfs filesystem. It
+then migrates the pages associated with the file from one node to
+another. When the program exits, huge page counts are as follows:
+
+  node0
+  1024  free_hugepages
+  1024  nr_hugepages
+
+  node1
+  0     free_hugepages
+  1024  nr_hugepages
+
+  Filesystem  Size  Used Avail Use% Mounted on
+  nodev       4.0G  2.0G  2.0G  50% /var/opt/hugepool
+
+That is as expected. 2G of huge pages are taken from the free_hugepages
+counts, and 2G is the size of the file in the explicitly mounted
+filesystem. If the file is then removed, the counts become:
+
+  node0
+  1024  free_hugepages
+  1024  nr_hugepages
+
+  node1
+  1024  free_hugepages
+  1024  nr_hugepages
+
+  Filesystem  Size  Used Avail Use% Mounted on
+  nodev       4.0G  2.0G  2.0G  50% /var/opt/hugepool
+
+Note that the filesystem still shows 2G of pages used, while there
+actually are no huge pages in use. The only way to 'fix' the filesystem
+accounting is to unmount the filesystem.
+
+If a hugetlb page is associated with an explicitly mounted filesystem,
+this information is contained in the page_private field. At migration
+time, this information is not preserved. To fix, simply transfer
+page_private from old to new page at migration time if necessary.
+
+There is a related race with removing a huge page from a file and
+migration.
+When a huge page is removed from the pagecache, the
+page_mapping() field is cleared, yet page_private remains set until the
+page is actually freed by free_huge_page(). A page could be migrated
+while in this state. However, since page_mapping() is not set, the
+hugetlbfs specific routine to transfer page_private is not called and we
+leak the page count in the filesystem.
+
+To fix that, check for this condition before migrating a huge page. If
+the condition is detected, return EBUSY for the page.
+
+Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com
+Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com
+Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active")
+Signed-off-by: Mike Kravetz
+Reviewed-by: Naoya Horiguchi
+Cc: Michal Hocko
+Cc: Andrea Arcangeli
+Cc: "Kirill A . Shutemov"
+Cc: Mel Gorman
+Cc: Davidlohr Bueso
+Cc:
+[mike.kravetz@oracle.com: v2]
+  Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com
+[mike.kravetz@oracle.com: update comment and changelog]
+  Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ fs/hugetlbfs/inode.c |   12 ++++++++++++
+ mm/hugetlb.c         |   14 ++++++++++++--
+ mm/migrate.c         |   11 +++++++++++
+ 3 files changed, 35 insertions(+), 2 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -609,6 +609,18 @@ static int hugetlbfs_migrate_page(struct
+ 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ 	if (rc != MIGRATEPAGE_SUCCESS)
+ 		return rc;
++
++	/*
++	 * page_private is subpool pointer in hugetlb pages. Transfer to
++	 * new page. PagePrivate is not associated with page_private for
++	 * hugetlb pages and can not be set here as only page_huge_active
++	 * pages can be migrated.
++	 */
++	if (page_private(page)) {
++		set_page_private(newpage, page_private(page));
++		set_page_private(page, 0);
++	}
++
+ 	migrate_page_copy(newpage, page);
+ 
+ 	return MIGRATEPAGE_SUCCESS;
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -2938,7 +2938,6 @@ retry_avoidcopy:
+ 	copy_user_huge_page(new_page, old_page, address, vma,
+ 			    pages_per_huge_page(h));
+ 	__SetPageUptodate(new_page);
+-	set_page_huge_active(new_page);
+ 
+ 	mmun_start = address & huge_page_mask(h);
+ 	mmun_end = mmun_start + huge_page_size(h);
+@@ -2959,6 +2958,7 @@ retry_avoidcopy:
+ 				make_huge_pte(vma, new_page, 1));
+ 		page_remove_rmap(old_page);
+ 		hugepage_add_new_anon_rmap(new_page, vma, address);
++		set_page_huge_active(new_page);
+ 		/* Make the old page be freed below */
+ 		new_page = old_page;
+ 	}
+@@ -3017,6 +3017,7 @@ static int hugetlb_no_page(struct mm_str
+ 	struct page *page;
+ 	pte_t new_pte;
+ 	spinlock_t *ptl;
++	bool new_page = false;
+ 
+ 	/*
+ 	 * Currently, we are forced to kill the process in the event the
+@@ -3050,7 +3051,7 @@ retry:
+ 		}
+ 		clear_huge_page(page, address, pages_per_huge_page(h));
+ 		__SetPageUptodate(page);
+-		set_page_huge_active(page);
++		new_page = true;
+ 
+ 		if (vma->vm_flags & VM_MAYSHARE) {
+ 			int err;
+@@ -3126,6 +3127,15 @@ retry:
+ 	}
+ 
+ 	spin_unlock(ptl);
++
++	/*
++	 * Only make newly allocated pages active. Existing pages found
++	 * in the pagecache could be !page_huge_active() if they have been
++	 * isolated for migration.
++	 */
++	if (new_page)
++		set_page_huge_active(page);
++
+ 	unlock_page(page);
+ out:
+ 	return ret;
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1050,6 +1050,16 @@ static int unmap_and_move_huge_page(new_
+ 		lock_page(hpage);
+ 	}
+ 
++	/*
++	 * Check for pages which are in the process of being freed. Without
++	 * page_mapping() set, hugetlbfs specific move page routine will not
++	 * be called and we could leak usage counts for subpools.
++	 */
++	if (page_private(hpage) && !page_mapping(hpage)) {
++		rc = -EBUSY;
++		goto out_unlock;
++	}
++
+ 	if (PageAnon(hpage))
+ 		anon_vma = page_get_anon_vma(hpage);
+ 
+@@ -1067,6 +1077,7 @@ static int unmap_and_move_huge_page(new_
+ 	if (rc == MIGRATEPAGE_SUCCESS)
+ 		hugetlb_cgroup_migrate(hpage, new_hpage);
+ 
++out_unlock:
+ 	unlock_page(hpage);
+ out:
+ 	if (rc != -EAGAIN)
diff --git a/queue-3.18/series b/queue-3.18/series
index 96d1adfce56..2d9771878ee 100644
--- a/queue-3.18/series
+++ b/queue-3.18/series
@@ -51,3 +51,5 @@ net-avoid-use-ipcb-in-cipso_v4_error.patch
 net-phy-micrel-ksz8061-link-failure-after-cable-connect.patch
 netlabel-fix-out-of-bounds-memory-accesses.patch
 ip6mr-do-not-call-__ip6_inc_stats-from-preemptible-context.patch
+hugetlbfs-fix-races-and-page-leaks-during-migration.patch
+xtensa-fix-get_wchan.patch
diff --git a/queue-3.18/xtensa-fix-get_wchan.patch b/queue-3.18/xtensa-fix-get_wchan.patch
new file mode 100644
index 00000000000..240495ffb3c
--- /dev/null
+++ b/queue-3.18/xtensa-fix-get_wchan.patch
@@ -0,0 +1,35 @@
+From d90b88fd3653f1fb66ecc6571b860d5a5749fa56 Mon Sep 17 00:00:00 2001
+From: Max Filippov
+Date: Wed, 2 Jan 2019 01:08:32 -0800
+Subject: xtensa: fix get_wchan
+
+From: Max Filippov
+
+commit d90b88fd3653f1fb66ecc6571b860d5a5749fa56 upstream.
+
+Stack unwinding is implemented incorrectly in xtensa get_wchan: instead
+of extracting a0 and a1 registers from the spill location under the
+stack pointer it extracts a word pointed to by the stack pointer and
+subtracts 4 or 3 from it.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Max Filippov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/xtensa/kernel/process.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/xtensa/kernel/process.c
++++ b/arch/xtensa/kernel/process.c
+@@ -303,8 +303,8 @@ unsigned long get_wchan(struct task_stru
+ 
+ 		/* Stack layout: sp-4: ra, sp-3: sp' */
+ 
+-		pc = MAKE_PC_FROM_RA(*(unsigned long*)sp - 4, sp);
+-		sp = *(unsigned long *)sp - 3;
++		pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), sp);
++		sp = SPILL_SLOT(sp, 1);
+ 	} while (count++ < 16);
+ 	return 0;
+ }
-- 
2.47.3