/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this. We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped. The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory. We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
48 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

struct vfio_iommu {
	struct list_head	domain_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	bool			v2;
	bool			nesting;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
	bool			fgsp;		/* Fine-grained super pages */
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
};

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

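/*
 * Look up a vfio_dma whose IOVA range overlaps [start, start + size) in
 * the rb-tree of non-overlapping mappings; returns NULL if none is found.
 */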
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

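/* Insert a new mapping into the rb-tree, ordered by IOVA. */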
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

struct vwork {
	struct mm_struct	*mm;
	long			npage;
	struct work_struct	work;
};

/* delayed decrement/increment for locked_vm */
static void vfio_lock_acct_bg(struct work_struct *work)
{
	struct vwork *vwork = container_of(work, struct vwork, work);
	struct mm_struct *mm;

	mm = vwork->mm;
	down_write(&mm->mmap_sem);
	mm->locked_vm += vwork->npage;
	up_write(&mm->mmap_sem);
	mmput(mm);
	kfree(vwork);
}

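/*
 * Charge (or release, for negative npage) locked pages against the task's
 * locked_vm, deferring to the workqueue above if mmap_sem is contended.
 */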
static void vfio_lock_acct(struct task_struct *task, long npage)
{
	struct vwork *vwork;
	struct mm_struct *mm;

	if (!npage)
		return;

	mm = get_task_mm(task);
	if (!mm)
		return; /* process exited or nothing to do */

	if (down_write_trylock(&mm->mmap_sem)) {
		mm->locked_vm += npage;
		up_write(&mm->mmap_sem);
		mmput(mm);
		return;
	}

	/*
	 * Couldn't get mmap_sem lock, so must setup to update
	 * mm->locked_vm later. If locked_vm were atomic, we
	 * wouldn't need this silliness
	 */
	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
	if (!vwork) {
		mmput(mm);
		return;
	}
	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
	vwork->mm = mm;
	vwork->npage = npage;
	schedule_work(&vwork->work);
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device. These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_head(tail);
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

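/*
 * Drop the reference taken when the pfn was pinned; page-backed, writable
 * mappings are marked dirty. Returns 1 if a page reference was released.
 */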
static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

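/*
 * Translate a user virtual address into a host pfn, either by pinning a
 * page-backed address with get_user_pages or, failing that, by walking a
 * VM_PFNMAP vma for MMIO-type mappings that have no struct page.
 */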
static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
			 int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret;

	if (mm == current->mm) {
		ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
					  page);
	} else {
		unsigned int flags = 0;

		if (prot & IOMMU_WRITE)
			flags |= FOLL_WRITE;

		down_read(&mm->mmap_sem);
		ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
					    NULL);
		up_read(&mm->mmap_sem);
	}

	if (ret == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&mm->mmap_sem);

	vma = find_vma_intersection(mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&mm->mmap_sem);

	return ret;
}

/*
 * Attempt to pin pages. We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
				  int prot, unsigned long *pfn_base)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	bool lock_cap = capable(CAP_IPC_LOCK);
	long ret, i;
	bool rsvd;

	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
	if (ret)
		return ret;

	rsvd = is_invalid_reserved_pfn(*pfn_base);

	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
		put_pfn(*pfn_base, prot);
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
			limit << PAGE_SHIFT);
		return -ENOMEM;
	}

	if (unlikely(disable_hugepages)) {
		if (!rsvd)
			vfio_lock_acct(current, 1);
		return 1;
	}

	/* Lock all the consecutive pages from pfn_base */
	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
		unsigned long pfn = 0;

		ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + i ||
		    rsvd != is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, prot);
			break;
		}

		if (!rsvd && !lock_cap &&
		    current->mm->locked_vm + i + 1 > limit) {
			put_pfn(pfn, prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				__func__, limit << PAGE_SHIFT);
			break;
		}
	}

	if (!rsvd)
		vfio_lock_acct(current, i);

	return i;
}

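/*
 * Unpin a run of npage contiguous pfns, optionally subtracting the pages
 * released from the current task's locked_vm accounting.
 */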
static long vfio_unpin_pages_remote(unsigned long pfn, long npage,
				    int prot, bool do_accounting)
{
	unsigned long unlocked = 0;
	long i;

	for (i = 0; i < npage; i++)
		unlocked += put_pfn(pfn++, prot);

	if (do_accounting)
		vfio_lock_acct(current, -unlocked);

	return unlocked;
}

static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	long unlocked = 0;

	if (!dma->size)
		return;
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system. Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin. The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		unmapped = iommu_unmap(domain->domain, iova, len);
		if (WARN_ON(!unmapped))
			break;

		unlocked += vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
						    unmapped >> PAGE_SHIFT,
						    dma->prot, false);
		iova += unmapped;

		cond_resched();
	}

	vfio_lock_acct(current, -unlocked);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma);
	vfio_unlink_dma(iommu, dma);
	kfree(dma);
}

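/*
 * Intersect the page-size bitmaps of all domains to determine the IOMMU
 * page sizes usable across every domain in this container.
 */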
static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = ULONG_MAX;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE. Pinning code uses that
	 * granularity while the IOMMU driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (bitmap & ~PAGE_MASK) {
		bitmap &= PAGE_MASK;
		bitmap |= PAGE_SIZE;
	}

	return bitmap;
}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	size_t unmapped = 0;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings. This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range. Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked. We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings. This
	 * resulted in a couple unusual behaviors. First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap. Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU. Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap. And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings. Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range. An error
	 * will be returned if these conditions are not met. The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 0);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages. This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE. Better to map smaller pages than nothing.
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret = 0;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
		iommu_unmap(domain->domain, iova, PAGE_SIZE);

	return ret;
}

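/*
 * Map a pinned, physically contiguous range of pages into every domain in
 * the container, unwinding any partial progress on failure.
 */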
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret) {
			if (ret != -EBUSY ||
			    map_try_harder(d, iova, pfn, npage, prot))
				goto unwind;
		}

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

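/*
 * Illustrative userspace usage (a sketch, not part of this driver): once a
 * container fd reports VFIO_TYPE1_IOMMU via VFIO_CHECK_EXTENSION, a page
 * aligned buffer is typically mapped for device DMA roughly as follows,
 * where container_fd, buf and buf_size are hypothetical caller-side names:
 *
 *	struct vfio_iommu_type1_dma_map dma_map = {
 *		.argsz = sizeof(dma_map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = buf_size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
 */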
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	long npage;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;
	unsigned long pfn;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		mutex_unlock(&iommu->lock);
		return -EEXIST;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		mutex_unlock(&iommu->lock);
		return -ENOMEM;
	}

	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages_remote(vaddr + dma->size,
					      size >> PAGE_SHIFT, prot, &pfn);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
		if (ret) {
			vfio_unpin_pages_remote(pfn, npage, prot, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	if (ret)
		vfio_remove_dma(iommu, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

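/*
 * iommu_group_for_each_dev() callback: record the bus_type of the group's
 * devices and fail if they do not all share the same one.
 */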
static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **bus = data;

	if (*bus && *bus != dev->bus)
		return -EINVAL;

	*bus = dev->bus;

	return 0;
}

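/*
 * Replay every existing DMA mapping into a newly added domain, using an
 * already-populated domain to resolve the physical address of each range.
 */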
static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
	n = rb_first(&iommu->dma_list);

	/* If there's not a domain, there better not be any mappings */
	if (WARN_ON(n && !d))
		return -EINVAL;

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}

/*
 * We change our unmap behavior slightly depending on whether the IOMMU
 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
 * for practically any contiguous power-of-two mapping we give it. This means
 * we don't need to look for contiguous chunks ourselves to make unmapping
 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
 * hugetlbfs is in use.
 */
static void vfio_test_domain_fgsp(struct vfio_domain *domain)
{
	struct page *pages;
	int ret, order = get_order(PAGE_SIZE * 2);

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
	if (!pages)
		return;

	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
			IOMMU_READ | IOMMU_WRITE | domain->prot);
	if (!ret) {
		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);

		if (unmapped == PAGE_SIZE)
			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
		else
			domain->fgsp = true;
	}

	__free_pages(pages, order);
}

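/* Find the vfio_group backed by iommu_group within a domain, if any. */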
static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
					   struct iommu_group *iommu_group)
{
	struct vfio_group *g;

	list_for_each_entry(g, &domain->group_list, next) {
		if (g->iommu_group == iommu_group)
			return g;
	}

	return NULL;
}

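/*
 * Attach an iommu_group to the container: allocate an IOMMU domain for the
 * group's bus, merge with a compatible existing domain when possible, and
 * otherwise replay the current mappings into the new domain.
 */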
static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL;
	int ret;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		if (find_iommu_group(d, iommu_group)) {
			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	domain->domain = iommu_domain_alloc(bus);
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);
	if (ret)
		goto out_domain;

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
			__func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain. We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	vfio_test_domain_fgsp(domain);

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}

static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

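/*
 * Detach an iommu_group from the container. When the last group of a
 * domain is removed the domain is freed, and when the last domain is
 * removed all remaining mappings are unpinned.
 */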
static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		group = find_iommu_group(domain, iommu_group);
		if (!group)
			continue;

		iommu_detach_group(domain->domain, iommu_group);
		list_del(&group->next);
		kfree(group);
		/*
		 * Group ownership provides privilege; if the group
		 * list is empty, the domain goes away. If it's the
		 * last domain, then all the mappings go away too.
		 */
		if (list_empty(&domain->group_list)) {
			if (list_is_singular(&iommu->domain_list))
				vfio_iommu_unmap_unpin_all(iommu);
			iommu_domain_free(domain->domain);
			list_del(&domain->next);
			kfree(domain);
		}
		goto done;
	}

done:
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
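		/* nesting implies v2; fall through */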
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}

static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_group *group, *group_tmp;

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		list_for_each_entry_safe(group, group_tmp,
					 &domain->group_list, next) {
			iommu_detach_group(domain->domain, group->iommu_group);
			list_del(&group->next);
			kfree(group);
		}
		iommu_domain_free(domain->domain);
		list_del(&domain->next);
		kfree(domain);
	}

	kfree(iommu);
}

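/*
 * Report whether every domain in the container enforces cache coherency
 * (IOMMU_CACHE), as queried by the VFIO_DMA_CC_IOMMU extension check.
 */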
static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_IOMMU_INFO_PGSIZES;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		return copy_to_user((void __user *)arg, &unmap, minsz) ?
			-EFAULT : 0;
	}

	return -ENOTTY;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);