From 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 10 Apr 2024 17:14:41 +0800
Subject: fork: defer linking file vma until vma is fully initialized

From: Miaohe Lin <linmiaohe@huawei.com>

commit 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 upstream.

Thorvald reported a WARNING [1]. And the root cause is below race:

 CPU 1					CPU 2
 fork					hugetlbfs_fallocate
  dup_mmap				 hugetlbfs_punch_hole
   i_mmap_lock_write(mapping);
   vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree.
   i_mmap_unlock_write(mapping);
   hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem!
					 i_mmap_lock_write(mapping);
					 hugetlb_vmdelete_list
					  vma_interval_tree_foreach
					   hugetlb_vma_trylock_write -- Vma_lock is cleared.
   tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem!
					   hugetlb_vma_unlock_write -- Vma_lock is assigned!!!
					 i_mmap_unlock_write(mapping);

hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside
i_mmap_rwsem lock while vma lock can be used in the same time. Fix this
by deferring linking file vma until vma is fully initialized. Those vmas
should be initialized first before they can be used.

Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com
Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reported-by: Thorvald Natvig <thorvald@google.com>
Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1]
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Cc: Tycho Andersen <tandersen@netflix.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/fork.c |   33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -712,6 +712,23 @@ static __latent_entropy int dup_mmap(str
 		} else if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
 		vm_flags_clear(tmp, VM_LOCKED_MASK);
+		/*
+		 * Copy/update hugetlb private vma information.
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			hugetlb_dup_vma_private(tmp);
+
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+		 */
+		vma_iter_bulk_store(&vmi, tmp);
+
+		mm->map_count++;
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
 		file = tmp->vm_file;
 		if (file) {
 			struct address_space *mapping = file->f_mapping;
@@ -728,25 +745,9 @@ static __latent_entropy int dup_mmap(str
 			i_mmap_unlock_write(mapping);
 		}
 
-		/*
-		 * Copy/update hugetlb private vma information.
-		 */
-		if (is_vm_hugetlb_page(tmp))
-			hugetlb_dup_vma_private(tmp);
-
-		/*
-		 * Link the vma into the MT. After using __mt_dup(), memory
-		 * allocation is not necessary here, so it cannot fail.
-		 */
-		vma_iter_bulk_store(&vmi, tmp);
-
-		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
 			retval = copy_page_range(tmp, mpnt);
 
-		if (tmp->vm_ops && tmp->vm_ops->open)
-			tmp->vm_ops->open(tmp);
-
 		if (retval) {
 			mpnt = vma_next(&vmi);
 			goto loop_out;