1 From foo@baz Sat Jul 28 10:14:30 CEST 2018
2 From: Alex Williamson <alex.williamson@redhat.com>
3 Date: Fri, 11 May 2018 09:05:02 -0600
4 Subject: vfio/type1: Fix task tracking for QEMU vCPU hotplug
6 From: Alex Williamson <alex.williamson@redhat.com>
8 [ Upstream commit 48d8476b41eed63567dd2f0ad125c895b9ac648a ]
10 MAP_DMA ioctls might be called from various threads within a process,
11 for example when using QEMU, the vCPU threads are often generating
12 these calls and we therefore take a reference to that vCPU task.
13 However, QEMU also supports vCPU hotplug on some machines and the task
14 that called MAP_DMA may have exited by the time UNMAP_DMA is called,
15 resulting in the mm_struct pointer being NULL and thus a failure to
16 match against the existing mapping.
18 To resolve this, we instead take a reference to the thread
19 group_leader, which has the same mm_struct and resource limits, but
20 is less likely to exit, at least in the QEMU case. A difficulty here is
21 guaranteeing that the capabilities of the group_leader match that of
22 the calling thread, which we resolve by tracking CAP_IPC_LOCK at the
23 time of calling rather than at an indeterminate time in the future.
24 Potentially this also results in better efficiency as this is now
25 recorded once per MAP_DMA ioctl.
27 Reported-by: Xu Yandong <xuyandong2@huawei.com>
28 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
29 Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
30 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
32 drivers/vfio/vfio_iommu_type1.c | 73 +++++++++++++++++++++++++---------------
33 1 file changed, 47 insertions(+), 26 deletions(-)
35 --- a/drivers/vfio/vfio_iommu_type1.c
36 +++ b/drivers/vfio/vfio_iommu_type1.c
37 @@ -83,6 +83,7 @@ struct vfio_dma {
38 size_t size; /* Map size (bytes) */
39 int prot; /* IOMMU_READ/WRITE */
41 + bool lock_cap; /* capable(CAP_IPC_LOCK) */
42 struct task_struct *task;
43 struct rb_root pfn_list; /* Ex-user pinned pfn list */
45 @@ -253,29 +254,25 @@ static int vfio_iova_put_vfio_pfn(struct
49 -static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
50 +static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
59 - is_current = (task->mm == current->mm);
61 - mm = is_current ? task->mm : get_task_mm(task);
62 + mm = async ? get_task_mm(dma->task) : dma->task->mm;
64 return -ESRCH; /* process exited */
66 ret = down_write_killable(&mm->mmap_sem);
69 - if (lock_cap ? !*lock_cap :
70 - !has_capability(task, CAP_IPC_LOCK)) {
71 + if (!dma->lock_cap) {
74 - limit = task_rlimit(task,
75 + limit = task_rlimit(dma->task,
76 RLIMIT_MEMLOCK) >> PAGE_SHIFT;
78 if (mm->locked_vm + npage > limit)
79 @@ -289,7 +286,7 @@ static int vfio_lock_acct(struct task_st
80 up_write(&mm->mmap_sem);
88 @@ -398,7 +395,7 @@ static int vaddr_get_pfn(struct mm_struc
90 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
91 long npage, unsigned long *pfn_base,
92 - bool lock_cap, unsigned long limit)
93 + unsigned long limit)
95 unsigned long pfn = 0;
96 long ret, pinned = 0, lock_acct = 0;
97 @@ -421,7 +418,7 @@ static long vfio_pin_pages_remote(struct
98 * pages are already counted against the user.
100 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
101 - if (!lock_cap && current->mm->locked_vm + 1 > limit) {
102 + if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
103 put_pfn(*pfn_base, dma->prot);
104 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
105 limit << PAGE_SHIFT);
106 @@ -447,7 +444,7 @@ static long vfio_pin_pages_remote(struct
109 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
111 + if (!dma->lock_cap &&
112 current->mm->locked_vm + lock_acct + 1 > limit) {
113 put_pfn(pfn, dma->prot);
114 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
115 @@ -460,7 +457,7 @@ static long vfio_pin_pages_remote(struct
119 - ret = vfio_lock_acct(current, lock_acct, &lock_cap);
120 + ret = vfio_lock_acct(dma, lock_acct, false);
124 @@ -491,7 +488,7 @@ static long vfio_unpin_pages_remote(stru
128 - vfio_lock_acct(dma->task, locked - unlocked, NULL);
129 + vfio_lock_acct(dma, locked - unlocked, true);
133 @@ -508,7 +505,7 @@ static int vfio_pin_page_external(struct
135 ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
136 if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
137 - ret = vfio_lock_acct(dma->task, 1, NULL);
138 + ret = vfio_lock_acct(dma, 1, true);
140 put_pfn(*pfn_base, dma->prot);
142 @@ -535,7 +532,7 @@ static int vfio_unpin_page_external(stru
143 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
146 - vfio_lock_acct(dma->task, -unlocked, NULL);
147 + vfio_lock_acct(dma, -unlocked, true);
151 @@ -827,7 +824,7 @@ static long vfio_unmap_unpin(struct vfio
152 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
155 - vfio_lock_acct(dma->task, -unlocked, NULL);
156 + vfio_lock_acct(dma, -unlocked, true);
160 @@ -1042,14 +1039,12 @@ static int vfio_pin_map_dma(struct vfio_
161 size_t size = map_size;
163 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
164 - bool lock_cap = capable(CAP_IPC_LOCK);
168 /* Pin a contiguous chunk of memory */
169 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
170 - size >> PAGE_SHIFT, &pfn,
172 + size >> PAGE_SHIFT, &pfn, limit);
176 @@ -1124,8 +1119,36 @@ static int vfio_dma_do_map(struct vfio_i
180 - get_task_struct(current);
181 - dma->task = current;
184 + * We need to be able to both add to a task's locked memory and test
185 + * against the locked memory limit and we need to be able to do both
186 + * outside of this call path as pinning can be asynchronous via the
187 + * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
188 + * task_struct and VM locked pages requires an mm_struct, however
189 + * holding an indefinite mm reference is not recommended, therefore we
190 + * only hold a reference to a task. We could hold a reference to
191 + * current, however QEMU uses this call path through vCPU threads,
192 + * which can be killed resulting in a NULL mm and failure in the unmap
193 + * path when called via a different thread. Avoid this problem by
194 + * using the group_leader as threads within the same group require
195 + * both CLONE_THREAD and CLONE_VM and will therefore use the same mm_struct.
198 + * Previously we also used the task for testing CAP_IPC_LOCK at the
199 + * time of pinning and accounting, however has_capability() makes use
200 + * of real_cred, a copy-on-write field, so we can't guarantee that it
201 + * matches group_leader, or in fact that it might not change by the
202 + * time it's evaluated. If a process were to call MAP_DMA with
203 + * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
204 + * possibly see different results for an iommu_mapped vfio_dma vs
205 + * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
206 + * time of calling MAP_DMA.
208 + get_task_struct(current->group_leader);
209 + dma->task = current->group_leader;
210 + dma->lock_cap = capable(CAP_IPC_LOCK);
212 dma->pfn_list = RB_ROOT;
214 /* Insert zero-sized and grow as we map chunks of it */
215 @@ -1160,7 +1183,6 @@ static int vfio_iommu_replay(struct vfio
216 struct vfio_domain *d;
218 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
219 - bool lock_cap = capable(CAP_IPC_LOCK);
222 /* Arbitrarily pick the first domain in the list for lookups */
223 @@ -1207,8 +1229,7 @@ static int vfio_iommu_replay(struct vfio
225 npage = vfio_pin_pages_remote(dma, vaddr,
233 @@ -1485,7 +1506,7 @@ static void vfio_iommu_unmap_unpin_reacc
234 if (!is_invalid_reserved_pfn(vpfn->pfn))
237 - vfio_lock_acct(dma->task, locked - unlocked, NULL);
238 + vfio_lock_acct(dma, locked - unlocked, true);