From foo@baz Sat Jul 28 10:14:30 CEST 2018
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 11 May 2018 09:05:02 -0600
Subject: vfio/type1: Fix task tracking for QEMU vCPU hotplug

From: Alex Williamson <alex.williamson@redhat.com>

[ Upstream commit 48d8476b41eed63567dd2f0ad125c895b9ac648a ]

MAP_DMA ioctls might be called from various threads within a process;
for example, when using QEMU, the vCPU threads often generate these
calls, and we therefore take a reference to that vCPU task.  However,
QEMU also supports vCPU hotplug on some machines, and the task that
called MAP_DMA may have exited by the time UNMAP_DMA is called,
resulting in the mm_struct pointer being NULL and thus a failure to
match against the existing mapping.

To resolve this, we instead take a reference to the thread
group_leader, which has the same mm_struct and resource limits but is
less likely to exit, at least in the QEMU case.  A difficulty here is
guaranteeing that the capabilities of the group_leader match those of
the calling thread, which we resolve by tracking CAP_IPC_LOCK at the
time of the MAP_DMA call rather than at an indeterminate time in the
future.  Potentially this also results in better efficiency, as the
capability is now recorded once per MAP_DMA ioctl.

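In short, the core of the change at MAP_DMA time (a condensed sketch
of the hunk below, not the verbatim diff):

    /* Reference the thread group_leader rather than the calling vCPU
     * thread: threads in one group share CLONE_THREAD and CLONE_VM,
     * so the same mm_struct remains reachable even if the calling
     * thread later exits on vCPU unplug.
     */
    get_task_struct(current->group_leader);
    dma->task = current->group_leader;
    /* Sample the *caller's* CAP_IPC_LOCK now; group_leader creds are
     * copy-on-write (real_cred) and may differ or change later.
     */
    dma->lock_cap = capable(CAP_IPC_LOCK);
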
Reported-by: Xu Yandong <xuyandong2@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/vfio/vfio_iommu_type1.c | 73 +++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 26 deletions(-)
34
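For orientation while reading the diff, the new vfio_lock_acct()
calling convention, summarized informally from the hunks below (the
two call sites shown are representative ones from this patch):

    /* Accounting now keys off the vfio_dma.  Synchronous callers on
     * the ioctl path, where current shares dma->task's mm, pass
     * async = false and use dma->task->mm directly; asynchronous
     * callers (the external mdev pin/unpin paths) pass async = true
     * so the helper takes and drops its own mm reference via
     * get_task_mm()/mmput().
     */
    ret = vfio_lock_acct(dma, lock_acct, false); /* vfio_pin_pages_remote() */
    vfio_lock_acct(dma, -unlocked, true);        /* vfio_unpin_page_external() */
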
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -83,6 +83,7 @@ struct vfio_dma {
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
 	bool			iommu_mapped;
+	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 };
@@ -253,29 +254,25 @@ static int vfio_iova_put_vfio_pfn(struct
 	return ret;
 }
 
-static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
+static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
 	struct mm_struct *mm;
-	bool is_current;
 	int ret;
 
 	if (!npage)
 		return 0;
 
-	is_current = (task->mm == current->mm);
-
-	mm = is_current ? task->mm : get_task_mm(task);
+	mm = async ? get_task_mm(dma->task) : dma->task->mm;
 	if (!mm)
 		return -ESRCH; /* process exited */
 
 	ret = down_write_killable(&mm->mmap_sem);
 	if (!ret) {
 		if (npage > 0) {
-			if (lock_cap ? !*lock_cap :
-			    !has_capability(task, CAP_IPC_LOCK)) {
+			if (!dma->lock_cap) {
 				unsigned long limit;
 
-				limit = task_rlimit(task,
+				limit = task_rlimit(dma->task,
						    RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 				if (mm->locked_vm + npage > limit)
@@ -289,7 +286,7 @@ static int vfio_lock_acct(struct task_st
 		up_write(&mm->mmap_sem);
 	}
 
-	if (!is_current)
+	if (async)
 		mmput(mm);
 
 	return ret;
@@ -398,7 +395,7 @@ static int vaddr_get_pfn(struct mm_struc
  */
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
				  long npage, unsigned long *pfn_base,
-				  bool lock_cap, unsigned long limit)
+				  unsigned long limit)
 {
 	unsigned long pfn = 0;
 	long ret, pinned = 0, lock_acct = 0;
@@ -421,7 +418,7 @@ static long vfio_pin_pages_remote(struct
 	 * pages are already counted against the user.
 	 */
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 				limit << PAGE_SHIFT);
@@ -447,7 +444,7 @@ static long vfio_pin_pages_remote(struct
 		}
 
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-			if (!lock_cap &&
+			if (!dma->lock_cap &&
 			    current->mm->locked_vm + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
@@ -460,7 +457,7 @@ static long vfio_pin_pages_remote(struct
 	}
 
 out:
-	ret = vfio_lock_acct(current, lock_acct, &lock_cap);
+	ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
 	if (ret) {
@@ -491,7 +488,7 @@ static long vfio_unpin_pages_remote(stru
 	}
 
 	if (do_accounting)
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
+		vfio_lock_acct(dma, locked - unlocked, true);
 
 	return unlocked;
 }
@@ -508,7 +505,7 @@ static int vfio_pin_page_external(struct
 
 	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
 	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
-		ret = vfio_lock_acct(dma->task, 1, NULL);
+		ret = vfio_lock_acct(dma, 1, true);
 		if (ret) {
 			put_pfn(*pfn_base, dma->prot);
 			if (ret == -ENOMEM)
@@ -535,7 +532,7 @@ static int vfio_unpin_page_external(stru
 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 
 	if (do_accounting)
-		vfio_lock_acct(dma->task, -unlocked, NULL);
+		vfio_lock_acct(dma, -unlocked, true);
 
 	return unlocked;
 }
@@ -827,7 +824,7 @@ static long vfio_unmap_unpin(struct vfio
 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
 
 	if (do_accounting) {
-		vfio_lock_acct(dma->task, -unlocked, NULL);
+		vfio_lock_acct(dma, -unlocked, true);
 		return 0;
 	}
 	return unlocked;
@@ -1042,14 +1039,12 @@ static int vfio_pin_map_dma(struct vfio_
 	size_t size = map_size;
 	long npage;
 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
 	int ret = 0;
 
 	while (size) {
 		/* Pin a contiguous chunk of memory */
 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, &pfn,
-					      lock_cap, limit);
+					      size >> PAGE_SHIFT, &pfn, limit);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -1124,8 +1119,36 @@ static int vfio_dma_do_map(struct vfio_i
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
-	get_task_struct(current);
-	dma->task = current;
+
+	/*
+	 * We need to be able to both add to a task's locked memory and test
+	 * against the locked memory limit, and we need to be able to do both
+	 * outside of this call path as pinning can be asynchronous via the
+	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
+	 * task_struct and VM locked pages require an mm_struct, however
+	 * holding an indefinite mm reference is not recommended, therefore
+	 * we only hold a reference to a task.  We could hold a reference to
+	 * current, however QEMU uses this call path through vCPU threads,
+	 * which can be killed, resulting in a NULL mm and failure in the
+	 * unmap path when called via a different thread.  Avoid this problem
+	 * by using the group_leader: threads within the same group require
+	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
+	 * mm_struct.
+	 *
+	 * Previously we also used the task for testing CAP_IPC_LOCK at the
+	 * time of pinning and accounting, however has_capability() makes use
+	 * of real_cred, a copy-on-write field, so we can't guarantee that it
+	 * matches group_leader, or indeed that it won't change by the time
+	 * it's evaluated.  If a process were to call MAP_DMA with
+	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that it
+	 * might see different results for an iommu_mapped vfio_dma vs an
+	 * externally mapped one.  Therefore track CAP_IPC_LOCK in vfio_dma
+	 * at the time of calling MAP_DMA.
+	 */
+	get_task_struct(current->group_leader);
+	dma->task = current->group_leader;
+	dma->lock_cap = capable(CAP_IPC_LOCK);
+
 	dma->pfn_list = RB_ROOT;
 
 	/* Insert zero-sized and grow as we map chunks of it */
@@ -1160,7 +1183,6 @@ static int vfio_iommu_replay(struct vfio
 	struct vfio_domain *d;
 	struct rb_node *n;
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
 	int ret;
 
 	/* Arbitrarily pick the first domain in the list for lookups */
@@ -1207,8 +1229,7 @@ static int vfio_iommu_replay(struct vfio
 
 				npage = vfio_pin_pages_remote(dma, vaddr,
							      n >> PAGE_SHIFT,
-							      &pfn, lock_cap,
-							      limit);
+							      &pfn, limit);
 				if (npage <= 0) {
 					WARN_ON(!npage);
 					ret = (int)npage;
@@ -1485,7 +1506,7 @@ static void vfio_iommu_unmap_unpin_reacc
 			if (!is_invalid_reserved_pfn(vpfn->pfn))
 				locked++;
 		}
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
+		vfio_lock_acct(dma, locked - unlocked, true);
 	}
 }
241