git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blame - queue-6.8/fork-defer-linking-file-vma-until-vma-is-fully-initialized.patch
6.8-stable patches
[thirdparty/kernel/stable-queue.git] / queue-6.8 / fork-defer-linking-file-vma-until-vma-is-fully-initialized.patch
CommitLineData
5c17fab8
GKH
1From 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 Mon Sep 17 00:00:00 2001
2From: Miaohe Lin <linmiaohe@huawei.com>
3Date: Wed, 10 Apr 2024 17:14:41 +0800
4Subject: fork: defer linking file vma until vma is fully initialized
5
6From: Miaohe Lin <linmiaohe@huawei.com>
7
8commit 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 upstream.
9
10Thorvald reported a WARNING [1]. The root cause is the following race:
11
12 CPU 1                                   CPU 2
13 fork                                    hugetlbfs_fallocate
14  dup_mmap                                hugetlbfs_punch_hole
15   i_mmap_lock_write(mapping);
16   vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree.
17   i_mmap_unlock_write(mapping);
18   hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem!
19                                           i_mmap_lock_write(mapping);
20                                           hugetlb_vmdelete_list
21                                            vma_interval_tree_foreach
22                                             hugetlb_vma_trylock_write -- Vma_lock is cleared.
23   tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem!
24                                             hugetlb_vma_unlock_write -- Vma_lock is assigned!!!
25                                           i_mmap_unlock_write(mapping);
26
27hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside the
28i_mmap_rwsem lock while the vma lock can be used at the same time. Fix this
29by deferring linking the file vma until the vma is fully initialized. Those
30vmas should be initialized before they can be used.
31
32Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com
33Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing")
34Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
35Reported-by: Thorvald Natvig <thorvald@google.com>
36Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1]
37Reviewed-by: Jane Chu <jane.chu@oracle.com>
38Cc: Christian Brauner <brauner@kernel.org>
39Cc: Heiko Carstens <hca@linux.ibm.com>
40Cc: Kent Overstreet <kent.overstreet@linux.dev>
41Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
42Cc: Mateusz Guzik <mjguzik@gmail.com>
43Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
44Cc: Miaohe Lin <linmiaohe@huawei.com>
45Cc: Muchun Song <muchun.song@linux.dev>
46Cc: Oleg Nesterov <oleg@redhat.com>
47Cc: Peng Zhang <zhangpeng.00@bytedance.com>
48Cc: Tycho Andersen <tandersen@netflix.com>
49Cc: <stable@vger.kernel.org>
50Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
51Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
52---
53 kernel/fork.c | 33 +++++++++++++++++----------------
54 1 file changed, 17 insertions(+), 16 deletions(-)
55
56--- a/kernel/fork.c
57+++ b/kernel/fork.c
58@@ -712,6 +712,23 @@ static __latent_entropy int dup_mmap(str
59 } else if (anon_vma_fork(tmp, mpnt))
60 goto fail_nomem_anon_vma_fork;
61 vm_flags_clear(tmp, VM_LOCKED_MASK);
62+ /*
63+ * Copy/update hugetlb private vma information.
64+ */
65+ if (is_vm_hugetlb_page(tmp))
66+ hugetlb_dup_vma_private(tmp);
67+
68+ /*
69+ * Link the vma into the MT. After using __mt_dup(), memory
70+ * allocation is not necessary here, so it cannot fail.
71+ */
72+ vma_iter_bulk_store(&vmi, tmp);
73+
74+ mm->map_count++;
75+
76+ if (tmp->vm_ops && tmp->vm_ops->open)
77+ tmp->vm_ops->open(tmp);
78+
79 file = tmp->vm_file;
80 if (file) {
81 struct address_space *mapping = file->f_mapping;
82@@ -728,25 +745,9 @@ static __latent_entropy int dup_mmap(str
83 i_mmap_unlock_write(mapping);
84 }
85
86- /*
87- * Copy/update hugetlb private vma information.
88- */
89- if (is_vm_hugetlb_page(tmp))
90- hugetlb_dup_vma_private(tmp);
91-
92- /*
93- * Link the vma into the MT. After using __mt_dup(), memory
94- * allocation is not necessary here, so it cannot fail.
95- */
96- vma_iter_bulk_store(&vmi, tmp);
97-
98- mm->map_count++;
99 if (!(tmp->vm_flags & VM_WIPEONFORK))
100 retval = copy_page_range(tmp, mpnt);
101
102- if (tmp->vm_ops && tmp->vm_ops->open)
103- tmp->vm_ops->open(tmp);
104-
105 if (retval) {
106 mpnt = vma_next(&vmi);
107 goto loop_out;