drivers/vfio/pci/vfio_pci.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
4 * Author: Alex Williamson <alex.williamson@redhat.com>
5 *
6 * Derived from original vfio:
7 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
8 * Author: Tom Lyon, pugs@cisco.com
9 */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/device.h>
14 #include <linux/eventfd.h>
15 #include <linux/file.h>
16 #include <linux/interrupt.h>
17 #include <linux/iommu.h>
18 #include <linux/module.h>
19 #include <linux/mutex.h>
20 #include <linux/notifier.h>
21 #include <linux/pci.h>
22 #include <linux/pm_runtime.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/uaccess.h>
26 #include <linux/vfio.h>
27 #include <linux/vgaarb.h>
28 #include <linux/nospec.h>
29 #include <linux/sched/mm.h>
30
31 #include "vfio_pci_private.h"
32
33 #define DRIVER_VERSION "0.2"
34 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
35 #define DRIVER_DESC "VFIO PCI - User Level meta-driver"
36
37 static char ids[1024] __initdata;
38 module_param_string(ids, ids, sizeof(ids), 0);
39 MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");
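/*
 * Illustrative example (editor's note, not part of the driver): the ids
 * parameter takes the same vendor:device[:subvendor[:subdevice[:class
 * [:class_mask]]]] tuples that can also be added at runtime through the
 * driver's new_id sysfs node.  The IDs below are placeholders for
 * hypothetical devices:
 *
 *	modprobe vfio-pci ids=8086:10fb,15b3:1013
 *	echo 8086 10fb > /sys/bus/pci/drivers/vfio-pci/new_id
 */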
40
41 static bool nointxmask;
42 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
43 MODULE_PARM_DESC(nointxmask,
44 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
45
46 #ifdef CONFIG_VFIO_PCI_VGA
47 static bool disable_vga;
48 module_param(disable_vga, bool, S_IRUGO);
49 MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
50 #endif
51
52 static bool disable_idle_d3;
53 module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
54 MODULE_PARM_DESC(disable_idle_d3,
55 "Disable using the PCI D3 low power state for idle, unused devices");
56
57 static bool enable_sriov;
58 #ifdef CONFIG_PCI_IOV
59 module_param(enable_sriov, bool, 0644);
60 MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver; enabling VFs without such support may result in non-functional VFs or PF.");
61 #endif
62
63 static inline bool vfio_vga_disabled(void)
64 {
65 #ifdef CONFIG_VFIO_PCI_VGA
66 return disable_vga;
67 #else
68 return true;
69 #endif
70 }
71
72 /*
73 * Our VGA arbiter participation is limited since we don't know anything
74 * about the device itself. However, if the device is the only VGA device
75 * downstream of a bridge and VFIO VGA support is disabled, then we can
76 * safely return legacy VGA IO and memory as not decoded since the user
77 * has no way to get to it and routing can be disabled externally at the
78 * bridge.
79 */
80 static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
81 {
82 struct vfio_pci_device *vdev = opaque;
83 struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
84 unsigned char max_busnr;
85 unsigned int decodes;
86
87 if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
88 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
89 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
90
91 max_busnr = pci_bus_max_busnr(pdev->bus);
92 decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
93
94 while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
95 if (tmp == pdev ||
96 pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
97 pci_is_root_bus(tmp->bus))
98 continue;
99
100 if (tmp->bus->number >= pdev->bus->number &&
101 tmp->bus->number <= max_busnr) {
102 pci_dev_put(tmp);
103 decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
104 break;
105 }
106 }
107
108 return decodes;
109 }
110
111 static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
112 {
113 return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
114 }
115
116 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
117 {
118 struct resource *res;
119 int i;
120 struct vfio_pci_dummy_resource *dummy_res;
121
122 INIT_LIST_HEAD(&vdev->dummy_resources_list);
123
124 for (i = 0; i < PCI_STD_NUM_BARS; i++) {
125 int bar = i + PCI_STD_RESOURCES;
126
127 res = &vdev->pdev->resource[bar];
128
129 if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
130 goto no_mmap;
131
132 if (!(res->flags & IORESOURCE_MEM))
133 goto no_mmap;
134
135 /*
136 * The PCI core shouldn't set up a resource with a
137 * type but zero size. But there may be bugs that
138 * cause us to do that.
139 */
140 if (!resource_size(res))
141 goto no_mmap;
142
143 if (resource_size(res) >= PAGE_SIZE) {
144 vdev->bar_mmap_supported[bar] = true;
145 continue;
146 }
147
148 if (!(res->start & ~PAGE_MASK)) {
149 /*
150 * Add a dummy resource to reserve the remainder
151 * of the exclusive page in case a hot-added
152 * device's BAR is assigned into it.
153 */
154 dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
155 if (dummy_res == NULL)
156 goto no_mmap;
157
158 dummy_res->resource.name = "vfio sub-page reserved";
159 dummy_res->resource.start = res->end + 1;
160 dummy_res->resource.end = res->start + PAGE_SIZE - 1;
161 dummy_res->resource.flags = res->flags;
162 if (request_resource(res->parent,
163 &dummy_res->resource)) {
164 kfree(dummy_res);
165 goto no_mmap;
166 }
167 dummy_res->index = bar;
168 list_add(&dummy_res->res_next,
169 &vdev->dummy_resources_list);
170 vdev->bar_mmap_supported[bar] = true;
171 continue;
172 }
173 /*
174 * Here we don't handle the case where the BAR is not page
175 * aligned, because we can't expect the BAR to be assigned to
176 * the same location within a page in the guest when we pass
177 * the BAR through. It's also hard to access such a BAR from
178 * userspace because we have no way to report the BAR's
179 * location within a page.
180 */
181 no_mmap:
182 vdev->bar_mmap_supported[bar] = false;
183 }
184 }
185
186 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
187 static void vfio_pci_disable(struct vfio_pci_device *vdev);
188 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
189
190 /*
191 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
192 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
193 * If a device implements the former but not the latter we would typically
194 * expect broken_intx_masking to be set and require an exclusive interrupt.
195 * However, since we do have control of the device's ability to assert INTx,
196 * we can instead pretend that the device does not implement INTx, virtualizing
197 * the pin register to report zero and maintaining DisINTx set on the host.
198 */
199 static bool vfio_pci_nointx(struct pci_dev *pdev)
200 {
201 switch (pdev->vendor) {
202 case PCI_VENDOR_ID_INTEL:
203 switch (pdev->device) {
204 /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
205 case 0x1572:
206 case 0x1574:
207 case 0x1580 ... 0x1581:
208 case 0x1583 ... 0x158b:
209 case 0x37d0 ... 0x37d2:
210 return true;
211 default:
212 return false;
213 }
214 }
215
216 return false;
217 }
218
219 static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
220 {
221 struct pci_dev *pdev = vdev->pdev;
222 u16 pmcsr;
223
224 if (!pdev->pm_cap)
225 return;
226
227 pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
228
229 vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
230 }
231
232 /*
233 * pci_set_power_state() wrapper handling devices which perform a soft reset on
234 * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
235 * restore when returned to D0. Saved separately from pci_saved_state for use
236 * by PM capability emulation and separately from pci_dev internal saved state
237 * to avoid it being overwritten and consumed around other resets.
238 */
239 int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
240 {
241 struct pci_dev *pdev = vdev->pdev;
242 bool needs_restore = false, needs_save = false;
243 int ret;
244
245 if (vdev->needs_pm_restore) {
246 if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
247 pci_save_state(pdev);
248 needs_save = true;
249 }
250
251 if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
252 needs_restore = true;
253 }
254
255 ret = pci_set_power_state(pdev, state);
256
257 if (!ret) {
258 /* D3 might be unsupported via quirk, skip unless in D3 */
259 if (needs_save && pdev->current_state >= PCI_D3hot) {
260 vdev->pm_save = pci_store_saved_state(pdev);
261 } else if (needs_restore) {
262 pci_load_and_free_saved_state(pdev, &vdev->pm_save);
263 pci_restore_state(pdev);
264 }
265 }
266
267 return ret;
268 }
269
270 static int vfio_pci_enable(struct vfio_pci_device *vdev)
271 {
272 struct pci_dev *pdev = vdev->pdev;
273 int ret;
274 u16 cmd;
275 u8 msix_pos;
276
277 vfio_pci_set_power_state(vdev, PCI_D0);
278
279 /* Don't allow our initial saved state to include busmaster */
280 pci_clear_master(pdev);
281
282 ret = pci_enable_device(pdev);
283 if (ret)
284 return ret;
285
286 /* If reset fails because of the device lock, fail this path entirely */
287 ret = pci_try_reset_function(pdev);
288 if (ret == -EAGAIN) {
289 pci_disable_device(pdev);
290 return ret;
291 }
292
293 vdev->reset_works = !ret;
294 pci_save_state(pdev);
295 vdev->pci_saved_state = pci_store_saved_state(pdev);
296 if (!vdev->pci_saved_state)
297 pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
298
299 if (likely(!nointxmask)) {
300 if (vfio_pci_nointx(pdev)) {
301 pci_info(pdev, "Masking broken INTx support\n");
302 vdev->nointx = true;
303 pci_intx(pdev, 0);
304 } else
305 vdev->pci_2_3 = pci_intx_mask_supported(pdev);
306 }
307
308 pci_read_config_word(pdev, PCI_COMMAND, &cmd);
309 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
310 cmd &= ~PCI_COMMAND_INTX_DISABLE;
311 pci_write_config_word(pdev, PCI_COMMAND, cmd);
312 }
313
314 ret = vfio_config_init(vdev);
315 if (ret) {
316 kfree(vdev->pci_saved_state);
317 vdev->pci_saved_state = NULL;
318 pci_disable_device(pdev);
319 return ret;
320 }
321
322 msix_pos = pdev->msix_cap;
323 if (msix_pos) {
324 u16 flags;
325 u32 table;
326
327 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
328 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
329
330 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
331 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
332 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
333 } else
334 vdev->msix_bar = 0xFF;
335
336 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
337 vdev->has_vga = true;
338
339
340 if (vfio_pci_is_vga(pdev) &&
341 pdev->vendor == PCI_VENDOR_ID_INTEL &&
342 IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
343 ret = vfio_pci_igd_init(vdev);
344 if (ret) {
345 pci_warn(pdev, "Failed to setup Intel IGD regions\n");
346 goto disable_exit;
347 }
348 }
349
350 if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
351 IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
352 ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
353 if (ret && ret != -ENODEV) {
354 pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
355 goto disable_exit;
356 }
357 }
358
359 if (pdev->vendor == PCI_VENDOR_ID_IBM &&
360 IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
361 ret = vfio_pci_ibm_npu2_init(vdev);
362 if (ret && ret != -ENODEV) {
363 pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
364 goto disable_exit;
365 }
366 }
367
368 vfio_pci_probe_mmaps(vdev);
369
370 return 0;
371
372 disable_exit:
373 vfio_pci_disable(vdev);
374 return ret;
375 }
376
377 static void vfio_pci_disable(struct vfio_pci_device *vdev)
378 {
379 struct pci_dev *pdev = vdev->pdev;
380 struct vfio_pci_dummy_resource *dummy_res, *tmp;
381 struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
382 int i, bar;
383
384 /* Stop the device from further DMA */
385 pci_clear_master(pdev);
386
387 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
388 VFIO_IRQ_SET_ACTION_TRIGGER,
389 vdev->irq_type, 0, 0, NULL);
390
391 /* Device closed, don't need mutex here */
392 list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
393 &vdev->ioeventfds_list, next) {
394 vfio_virqfd_disable(&ioeventfd->virqfd);
395 list_del(&ioeventfd->next);
396 kfree(ioeventfd);
397 }
398 vdev->ioeventfds_nr = 0;
399
400 vdev->virq_disabled = false;
401
402 for (i = 0; i < vdev->num_regions; i++)
403 vdev->region[i].ops->release(vdev, &vdev->region[i]);
404
405 vdev->num_regions = 0;
406 kfree(vdev->region);
407 vdev->region = NULL; /* don't krealloc a freed pointer */
408
409 vfio_config_free(vdev);
410
411 for (i = 0; i < PCI_STD_NUM_BARS; i++) {
412 bar = i + PCI_STD_RESOURCES;
413 if (!vdev->barmap[bar])
414 continue;
415 pci_iounmap(pdev, vdev->barmap[bar]);
416 pci_release_selected_regions(pdev, 1 << bar);
417 vdev->barmap[bar] = NULL;
418 }
419
420 list_for_each_entry_safe(dummy_res, tmp,
421 &vdev->dummy_resources_list, res_next) {
422 list_del(&dummy_res->res_next);
423 release_resource(&dummy_res->resource);
424 kfree(dummy_res);
425 }
426
427 vdev->needs_reset = true;
428
429 /*
430 * If we have saved state, restore it. If we can reset the device,
431 * even better. Resetting with current state seems better than
432 * nothing, but saving and restoring current state without reset
433 * is just busy work.
434 */
435 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
436 pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
437
438 if (!vdev->reset_works)
439 goto out;
440
441 pci_save_state(pdev);
442 }
443
444 /*
445 * Disable INTx and MSI, presumably to avoid spurious interrupts
446 * during reset. Stolen from pci_reset_function()
447 */
448 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
449
450 /*
451 * Try to get the locks ourselves to prevent a deadlock. The
452 * success of this is dependent on being able to lock the device,
453 * which is not always possible.
454 * We cannot use the "try" reset interface here, which would
455 * overwrite the previously restored configuration information.
456 */
457 if (vdev->reset_works && pci_cfg_access_trylock(pdev)) {
458 if (device_trylock(&pdev->dev)) {
459 if (!__pci_reset_function_locked(pdev))
460 vdev->needs_reset = false;
461 device_unlock(&pdev->dev);
462 }
463 pci_cfg_access_unlock(pdev);
464 }
465
466 pci_restore_state(pdev);
467 out:
468 pci_disable_device(pdev);
469
470 vfio_pci_try_bus_reset(vdev);
471
472 if (!disable_idle_d3)
473 vfio_pci_set_power_state(vdev, PCI_D3hot);
474 }
475
476 static struct pci_driver vfio_pci_driver;
477
478 static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
479 struct vfio_device **pf_dev)
480 {
481 struct pci_dev *physfn = pci_physfn(vdev->pdev);
482
483 if (!vdev->pdev->is_virtfn)
484 return NULL;
485
486 *pf_dev = vfio_device_get_from_dev(&physfn->dev);
487 if (!*pf_dev)
488 return NULL;
489
490 if (pci_dev_driver(physfn) != &vfio_pci_driver) {
491 vfio_device_put(*pf_dev);
492 return NULL;
493 }
494
495 return vfio_device_data(*pf_dev);
496 }
497
498 static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
499 {
500 struct vfio_device *pf_dev;
501 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
502
503 if (!pf_vdev)
504 return;
505
506 mutex_lock(&pf_vdev->vf_token->lock);
507 pf_vdev->vf_token->users += val;
508 WARN_ON(pf_vdev->vf_token->users < 0);
509 mutex_unlock(&pf_vdev->vf_token->lock);
510
511 vfio_device_put(pf_dev);
512 }
513
514 static void vfio_pci_release(void *device_data)
515 {
516 struct vfio_pci_device *vdev = device_data;
517
518 mutex_lock(&vdev->reflck->lock);
519
520 if (!(--vdev->refcnt)) {
521 vfio_pci_vf_token_user_add(vdev, -1);
522 vfio_spapr_pci_eeh_release(vdev->pdev);
523 vfio_pci_disable(vdev);
524 if (vdev->err_trigger)
525 eventfd_ctx_put(vdev->err_trigger);
526 if (vdev->req_trigger)
527 eventfd_ctx_put(vdev->req_trigger);
528 }
529
530 mutex_unlock(&vdev->reflck->lock);
531
532 module_put(THIS_MODULE);
533 }
534
535 static int vfio_pci_open(void *device_data)
536 {
537 struct vfio_pci_device *vdev = device_data;
538 int ret = 0;
539
540 if (!try_module_get(THIS_MODULE))
541 return -ENODEV;
542
543 mutex_lock(&vdev->reflck->lock);
544
545 if (!vdev->refcnt) {
546 ret = vfio_pci_enable(vdev);
547 if (ret)
548 goto error;
549
550 vfio_spapr_pci_eeh_open(vdev->pdev);
551 vfio_pci_vf_token_user_add(vdev, 1);
552 }
553 vdev->refcnt++;
554 error:
555 mutex_unlock(&vdev->reflck->lock);
556 if (ret)
557 module_put(THIS_MODULE);
558 return ret;
559 }
560
561 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
562 {
563 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
564 u8 pin;
565
566 if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
567 vdev->nointx || vdev->pdev->is_virtfn)
568 return 0;
569
570 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
571
572 return pin ? 1 : 0;
573 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
574 u8 pos;
575 u16 flags;
576
577 pos = vdev->pdev->msi_cap;
578 if (pos) {
579 pci_read_config_word(vdev->pdev,
580 pos + PCI_MSI_FLAGS, &flags);
581 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
582 }
583 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
584 u8 pos;
585 u16 flags;
586
587 pos = vdev->pdev->msix_cap;
588 if (pos) {
589 pci_read_config_word(vdev->pdev,
590 pos + PCI_MSIX_FLAGS, &flags);
591
592 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
593 }
594 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
595 if (pci_is_pcie(vdev->pdev))
596 return 1;
597 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
598 return 1;
599 }
600
601 return 0;
602 }
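/*
 * Worked example (editor's addition): MSI encodes its vector count as a
 * power of two, so a Multiple Message Capable field of 3 (flags &
 * PCI_MSI_FLAGS_QMASK yielding 0x6) reports 1 << 3 = 8 vectors; MSI-X
 * uses an N-1 encoding for QSIZE, so a value of 7 reports 7 + 1 = 8
 * table entries.
 */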
603
604 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
605 {
606 (*(int *)data)++;
607 return 0;
608 }
609
610 struct vfio_pci_fill_info {
611 int max;
612 int cur;
613 struct vfio_pci_dependent_device *devices;
614 };
615
616 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
617 {
618 struct vfio_pci_fill_info *fill = data;
619 struct iommu_group *iommu_group;
620
621 if (fill->cur == fill->max)
622 return -EAGAIN; /* Something changed, try again */
623
624 iommu_group = iommu_group_get(&pdev->dev);
625 if (!iommu_group)
626 return -EPERM; /* Cannot reset non-isolated devices */
627
628 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
629 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
630 fill->devices[fill->cur].bus = pdev->bus->number;
631 fill->devices[fill->cur].devfn = pdev->devfn;
632 fill->cur++;
633 iommu_group_put(iommu_group);
634 return 0;
635 }
636
637 struct vfio_pci_group_entry {
638 struct vfio_group *group;
639 int id;
640 };
641
642 struct vfio_pci_group_info {
643 int count;
644 struct vfio_pci_group_entry *groups;
645 };
646
647 static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
648 {
649 struct vfio_pci_group_info *info = data;
650 struct iommu_group *group;
651 int id, i;
652
653 group = iommu_group_get(&pdev->dev);
654 if (!group)
655 return -EPERM;
656
657 id = iommu_group_id(group);
658
659 for (i = 0; i < info->count; i++)
660 if (info->groups[i].id == id)
661 break;
662
663 iommu_group_put(group);
664
665 return (i == info->count) ? -EINVAL : 0;
666 }
667
668 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
669 {
670 for (; pdev; pdev = pdev->bus->self)
671 if (pdev->bus == slot->bus)
672 return (pdev->slot == slot);
673 return false;
674 }
675
676 struct vfio_pci_walk_info {
677 int (*fn)(struct pci_dev *, void *data);
678 void *data;
679 struct pci_dev *pdev;
680 bool slot;
681 int ret;
682 };
683
684 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
685 {
686 struct vfio_pci_walk_info *walk = data;
687
688 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
689 walk->ret = walk->fn(pdev, walk->data);
690
691 return walk->ret;
692 }
693
694 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
695 int (*fn)(struct pci_dev *,
696 void *data), void *data,
697 bool slot)
698 {
699 struct vfio_pci_walk_info walk = {
700 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
701 };
702
703 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
704
705 return walk.ret;
706 }
707
708 static int msix_mmappable_cap(struct vfio_pci_device *vdev,
709 struct vfio_info_cap *caps)
710 {
711 struct vfio_info_cap_header header = {
712 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
713 .version = 1
714 };
715
716 return vfio_info_add_capability(caps, &header, sizeof(header));
717 }
718
719 int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
720 unsigned int type, unsigned int subtype,
721 const struct vfio_pci_regops *ops,
722 size_t size, u32 flags, void *data)
723 {
724 struct vfio_pci_region *region;
725
726 region = krealloc(vdev->region,
727 (vdev->num_regions + 1) * sizeof(*region),
728 GFP_KERNEL);
729 if (!region)
730 return -ENOMEM;
731
732 vdev->region = region;
733 vdev->region[vdev->num_regions].type = type;
734 vdev->region[vdev->num_regions].subtype = subtype;
735 vdev->region[vdev->num_regions].ops = ops;
736 vdev->region[vdev->num_regions].size = size;
737 vdev->region[vdev->num_regions].flags = flags;
738 vdev->region[vdev->num_regions].data = data;
739
740 vdev->num_regions++;
741
742 return 0;
743 }
744
745 struct vfio_devices {
746 struct vfio_device **devices;
747 int cur_index;
748 int max_index;
749 };
750
751 static long vfio_pci_ioctl(void *device_data,
752 unsigned int cmd, unsigned long arg)
753 {
754 struct vfio_pci_device *vdev = device_data;
755 unsigned long minsz;
756
757 if (cmd == VFIO_DEVICE_GET_INFO) {
758 struct vfio_device_info info;
759
760 minsz = offsetofend(struct vfio_device_info, num_irqs);
761
762 if (copy_from_user(&info, (void __user *)arg, minsz))
763 return -EFAULT;
764
765 if (info.argsz < minsz)
766 return -EINVAL;
767
768 info.flags = VFIO_DEVICE_FLAGS_PCI;
769
770 if (vdev->reset_works)
771 info.flags |= VFIO_DEVICE_FLAGS_RESET;
772
773 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
774 info.num_irqs = VFIO_PCI_NUM_IRQS;
775
776 return copy_to_user((void __user *)arg, &info, minsz) ?
777 -EFAULT : 0;
778
779 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
780 struct pci_dev *pdev = vdev->pdev;
781 struct vfio_region_info info;
782 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
783 int i, ret;
784
785 minsz = offsetofend(struct vfio_region_info, offset);
786
787 if (copy_from_user(&info, (void __user *)arg, minsz))
788 return -EFAULT;
789
790 if (info.argsz < minsz)
791 return -EINVAL;
792
793 switch (info.index) {
794 case VFIO_PCI_CONFIG_REGION_INDEX:
795 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
796 info.size = pdev->cfg_size;
797 info.flags = VFIO_REGION_INFO_FLAG_READ |
798 VFIO_REGION_INFO_FLAG_WRITE;
799 break;
800 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
801 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
802 info.size = pci_resource_len(pdev, info.index);
803 if (!info.size) {
804 info.flags = 0;
805 break;
806 }
807
808 info.flags = VFIO_REGION_INFO_FLAG_READ |
809 VFIO_REGION_INFO_FLAG_WRITE;
810 if (vdev->bar_mmap_supported[info.index]) {
811 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
812 if (info.index == vdev->msix_bar) {
813 ret = msix_mmappable_cap(vdev, &caps);
814 if (ret)
815 return ret;
816 }
817 }
818
819 break;
820 case VFIO_PCI_ROM_REGION_INDEX:
821 {
822 void __iomem *io;
823 size_t size;
824 u16 cmd;
825
826 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
827 info.flags = 0;
828
829 /* Report the BAR size, not the ROM size */
830 info.size = pci_resource_len(pdev, info.index);
831 if (!info.size) {
832 /* Shadow ROMs appear as PCI option ROMs */
833 if (pdev->resource[PCI_ROM_RESOURCE].flags &
834 IORESOURCE_ROM_SHADOW)
835 info.size = 0x20000;
836 else
837 break;
838 }
839
840 /*
841 * Is it really there? Enable memory decode for
842 * implicit access in pci_map_rom().
843 */
844 cmd = vfio_pci_memory_lock_and_enable(vdev);
845 io = pci_map_rom(pdev, &size);
846 if (io) {
847 info.flags = VFIO_REGION_INFO_FLAG_READ;
848 pci_unmap_rom(pdev, io);
849 } else {
850 info.size = 0;
851 }
852 vfio_pci_memory_unlock_and_restore(vdev, cmd);
853
854 break;
855 }
856 case VFIO_PCI_VGA_REGION_INDEX:
857 if (!vdev->has_vga)
858 return -EINVAL;
859
860 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
861 info.size = 0xc0000;
862 info.flags = VFIO_REGION_INFO_FLAG_READ |
863 VFIO_REGION_INFO_FLAG_WRITE;
864
865 break;
866 default:
867 {
868 struct vfio_region_info_cap_type cap_type = {
869 .header.id = VFIO_REGION_INFO_CAP_TYPE,
870 .header.version = 1 };
871
872 if (info.index >=
873 VFIO_PCI_NUM_REGIONS + vdev->num_regions)
874 return -EINVAL;
875 info.index = array_index_nospec(info.index,
876 VFIO_PCI_NUM_REGIONS +
877 vdev->num_regions);
878
879 i = info.index - VFIO_PCI_NUM_REGIONS;
880
881 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
882 info.size = vdev->region[i].size;
883 info.flags = vdev->region[i].flags;
884
885 cap_type.type = vdev->region[i].type;
886 cap_type.subtype = vdev->region[i].subtype;
887
888 ret = vfio_info_add_capability(&caps, &cap_type.header,
889 sizeof(cap_type));
890 if (ret)
891 return ret;
892
893 if (vdev->region[i].ops->add_capability) {
894 ret = vdev->region[i].ops->add_capability(vdev,
895 &vdev->region[i], &caps);
896 if (ret)
897 return ret;
898 }
899 }
900 }
901
902 if (caps.size) {
903 info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
904 if (info.argsz < sizeof(info) + caps.size) {
905 info.argsz = sizeof(info) + caps.size;
906 info.cap_offset = 0;
907 } else {
908 vfio_info_cap_shift(&caps, sizeof(info));
909 if (copy_to_user((void __user *)arg +
910 sizeof(info), caps.buf,
911 caps.size)) {
912 kfree(caps.buf);
913 return -EFAULT;
914 }
915 info.cap_offset = sizeof(info);
916 }
917
918 kfree(caps.buf);
919 }
920
921 return copy_to_user((void __user *)arg, &info, minsz) ?
922 -EFAULT : 0;
923
924 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
925 struct vfio_irq_info info;
926
927 minsz = offsetofend(struct vfio_irq_info, count);
928
929 if (copy_from_user(&info, (void __user *)arg, minsz))
930 return -EFAULT;
931
932 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
933 return -EINVAL;
934
935 switch (info.index) {
936 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
937 case VFIO_PCI_REQ_IRQ_INDEX:
938 break;
939 case VFIO_PCI_ERR_IRQ_INDEX:
940 if (pci_is_pcie(vdev->pdev))
941 break;
942 /* fall through */
943 default:
944 return -EINVAL;
945 }
946
947 info.flags = VFIO_IRQ_INFO_EVENTFD;
948
949 info.count = vfio_pci_get_irq_count(vdev, info.index);
950
951 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
952 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
953 VFIO_IRQ_INFO_AUTOMASKED);
954 else
955 info.flags |= VFIO_IRQ_INFO_NORESIZE;
956
957 return copy_to_user((void __user *)arg, &info, minsz) ?
958 -EFAULT : 0;
959
960 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
961 struct vfio_irq_set hdr;
962 u8 *data = NULL;
963 int max, ret = 0;
964 size_t data_size = 0;
965
966 minsz = offsetofend(struct vfio_irq_set, count);
967
968 if (copy_from_user(&hdr, (void __user *)arg, minsz))
969 return -EFAULT;
970
971 max = vfio_pci_get_irq_count(vdev, hdr.index);
972
973 ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
974 VFIO_PCI_NUM_IRQS, &data_size);
975 if (ret)
976 return ret;
977
978 if (data_size) {
979 data = memdup_user((void __user *)(arg + minsz),
980 data_size);
981 if (IS_ERR(data))
982 return PTR_ERR(data);
983 }
984
985 mutex_lock(&vdev->igate);
986
987 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
988 hdr.start, hdr.count, data);
989
990 mutex_unlock(&vdev->igate);
991 kfree(data);
992
993 return ret;
994
995 } else if (cmd == VFIO_DEVICE_RESET) {
996 int ret;
997
998 if (!vdev->reset_works)
999 return -EINVAL;
1000
1001 vfio_pci_zap_and_down_write_memory_lock(vdev);
1002 ret = pci_try_reset_function(vdev->pdev);
1003 up_write(&vdev->memory_lock);
1004
1005 return ret;
1006
1007 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
1008 struct vfio_pci_hot_reset_info hdr;
1009 struct vfio_pci_fill_info fill = { 0 };
1010 struct vfio_pci_dependent_device *devices = NULL;
1011 bool slot = false;
1012 int ret = 0;
1013
1014 minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
1015
1016 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1017 return -EFAULT;
1018
1019 if (hdr.argsz < minsz)
1020 return -EINVAL;
1021
1022 hdr.flags = 0;
1023
1024 /* Can we do a slot or bus reset or neither? */
1025 if (!pci_probe_reset_slot(vdev->pdev->slot))
1026 slot = true;
1027 else if (pci_probe_reset_bus(vdev->pdev->bus))
1028 return -ENODEV;
1029
1030 /* How many devices are affected? */
1031 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1032 vfio_pci_count_devs,
1033 &fill.max, slot);
1034 if (ret)
1035 return ret;
1036
1037 WARN_ON(!fill.max); /* Should always be at least one */
1038
1039 /*
1040 * If there's enough space, fill it now; otherwise return
1041 * -ENOSPC and the number of devices affected.
1042 */
1043 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
1044 ret = -ENOSPC;
1045 hdr.count = fill.max;
1046 goto reset_info_exit;
1047 }
1048
1049 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
1050 if (!devices)
1051 return -ENOMEM;
1052
1053 fill.devices = devices;
1054
1055 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1056 vfio_pci_fill_devs,
1057 &fill, slot);
1058
1059 /*
1060 * If a device was removed between counting and filling,
1061 * we may come up short of fill.max. If a device was
1062 * added, we'll have a return of -EAGAIN above.
1063 */
1064 if (!ret)
1065 hdr.count = fill.cur;
1066
1067 reset_info_exit:
1068 if (copy_to_user((void __user *)arg, &hdr, minsz))
1069 ret = -EFAULT;
1070
1071 if (!ret) {
1072 if (copy_to_user((void __user *)(arg + minsz), devices,
1073 hdr.count * sizeof(*devices)))
1074 ret = -EFAULT;
1075 }
1076
1077 kfree(devices);
1078 return ret;
1079
1080 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
1081 struct vfio_pci_hot_reset hdr;
1082 int32_t *group_fds;
1083 struct vfio_pci_group_entry *groups;
1084 struct vfio_pci_group_info info;
1085 struct vfio_devices devs = { .cur_index = 0 };
1086 bool slot = false;
1087 int i, group_idx, mem_idx = 0, count = 0, ret = 0;
1088
1089 minsz = offsetofend(struct vfio_pci_hot_reset, count);
1090
1091 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1092 return -EFAULT;
1093
1094 if (hdr.argsz < minsz || hdr.flags)
1095 return -EINVAL;
1096
1097 /* Can we do a slot or bus reset or neither? */
1098 if (!pci_probe_reset_slot(vdev->pdev->slot))
1099 slot = true;
1100 else if (pci_probe_reset_bus(vdev->pdev->bus))
1101 return -ENODEV;
1102
1103 /*
1104 * We can't let userspace give us an arbitrarily large
1105 * buffer to copy, so verify how many we think there
1106 * could be. Note that groups can have multiple devices, so
1107 * one group per device is the maximum.
1108 */
1109 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1110 vfio_pci_count_devs,
1111 &count, slot);
1112 if (ret)
1113 return ret;
1114
1115 /* Somewhere between 1 and count is OK */
1116 if (!hdr.count || hdr.count > count)
1117 return -EINVAL;
1118
1119 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
1120 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
1121 if (!group_fds || !groups) {
1122 kfree(group_fds);
1123 kfree(groups);
1124 return -ENOMEM;
1125 }
1126
1127 if (copy_from_user(group_fds, (void __user *)(arg + minsz),
1128 hdr.count * sizeof(*group_fds))) {
1129 kfree(group_fds);
1130 kfree(groups);
1131 return -EFAULT;
1132 }
1133
1134 /*
1135 * For each group_fd, get the group through the vfio external
1136 * user interface and store the group and iommu ID. This
1137 * ensures the group is held across the reset.
1138 */
1139 for (group_idx = 0; group_idx < hdr.count; group_idx++) {
1140 struct vfio_group *group;
1141 struct fd f = fdget(group_fds[group_idx]);
1142 if (!f.file) {
1143 ret = -EBADF;
1144 break;
1145 }
1146
1147 group = vfio_group_get_external_user(f.file);
1148 fdput(f);
1149 if (IS_ERR(group)) {
1150 ret = PTR_ERR(group);
1151 break;
1152 }
1153
1154 groups[group_idx].group = group;
1155 groups[group_idx].id =
1156 vfio_external_user_iommu_id(group);
1157 }
1158
1159 kfree(group_fds);
1160
1161 /* release reference to groups on error */
1162 if (ret)
1163 goto hot_reset_release;
1164
1165 info.count = hdr.count;
1166 info.groups = groups;
1167
1168 /*
1169 * Test whether all the affected devices are contained
1170 * by the set of groups provided by the user.
1171 */
1172 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1173 vfio_pci_validate_devs,
1174 &info, slot);
1175 if (ret)
1176 goto hot_reset_release;
1177
1178 devs.max_index = count;
1179 devs.devices = kcalloc(count, sizeof(struct vfio_device *),
1180 GFP_KERNEL);
1181 if (!devs.devices) {
1182 ret = -ENOMEM;
1183 goto hot_reset_release;
1184 }
1185
1186 /*
1187 * We need to get memory_lock for each device, but devices
1188 * can share mmap_lock; therefore we need to zap and hold
1189 * the vma_lock for each device, and only then get each
1190 * memory_lock.
1191 */
1192 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1193 vfio_pci_try_zap_and_vma_lock_cb,
1194 &devs, slot);
1195 if (ret)
1196 goto hot_reset_release;
1197
1198 for (; mem_idx < devs.cur_index; mem_idx++) {
1199 struct vfio_pci_device *tmp;
1200
1201 tmp = vfio_device_data(devs.devices[mem_idx]);
1202
1203 ret = down_write_trylock(&tmp->memory_lock);
1204 if (!ret) {
1205 ret = -EBUSY;
1206 goto hot_reset_release;
1207 }
1208 mutex_unlock(&tmp->vma_lock);
1209 }
1210
1211 /* User has access, do the reset */
1212 ret = pci_reset_bus(vdev->pdev);
1213
1214 hot_reset_release:
1215 for (i = 0; i < devs.cur_index; i++) {
1216 struct vfio_device *device;
1217 struct vfio_pci_device *tmp;
1218
1219 device = devs.devices[i];
1220 tmp = vfio_device_data(device);
1221
1222 if (i < mem_idx)
1223 up_write(&tmp->memory_lock);
1224 else
1225 mutex_unlock(&tmp->vma_lock);
1226 vfio_device_put(device);
1227 }
1228 kfree(devs.devices);
1229
1230 for (group_idx--; group_idx >= 0; group_idx--)
1231 vfio_group_put_external_user(groups[group_idx].group);
1232
1233 kfree(groups);
1234 return ret;
1235 } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
1236 struct vfio_device_ioeventfd ioeventfd;
1237 int count;
1238
1239 minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1240
1241 if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
1242 return -EFAULT;
1243
1244 if (ioeventfd.argsz < minsz)
1245 return -EINVAL;
1246
1247 if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1248 return -EINVAL;
1249
1250 count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1251
1252 if (hweight8(count) != 1 || ioeventfd.fd < -1)
1253 return -EINVAL;
1254
1255 return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
1256 ioeventfd.data, count, ioeventfd.fd);
1257 } else if (cmd == VFIO_DEVICE_FEATURE) {
1258 struct vfio_device_feature feature;
1259 uuid_t uuid;
1260
1261 minsz = offsetofend(struct vfio_device_feature, flags);
1262
1263 if (copy_from_user(&feature, (void __user *)arg, minsz))
1264 return -EFAULT;
1265
1266 if (feature.argsz < minsz)
1267 return -EINVAL;
1268
1269 /* Check unknown flags */
1270 if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
1271 VFIO_DEVICE_FEATURE_SET |
1272 VFIO_DEVICE_FEATURE_GET |
1273 VFIO_DEVICE_FEATURE_PROBE))
1274 return -EINVAL;
1275
1276 /* GET & SET are mutually exclusive except with PROBE */
1277 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1278 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1279 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1280 return -EINVAL;
1281
1282 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1283 case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1284 if (!vdev->vf_token)
1285 return -ENOTTY;
1286
1287 /*
1288 * We do not support GET of the VF Token UUID as this
1289 * could expose the token of the previous device user.
1290 */
1291 if (feature.flags & VFIO_DEVICE_FEATURE_GET)
1292 return -EINVAL;
1293
1294 if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
1295 return 0;
1296
1297 /* Don't SET unless told to do so */
1298 if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
1299 return -EINVAL;
1300
1301 if (feature.argsz < minsz + sizeof(uuid))
1302 return -EINVAL;
1303
1304 if (copy_from_user(&uuid, (void __user *)(arg + minsz),
1305 sizeof(uuid)))
1306 return -EFAULT;
1307
1308 mutex_lock(&vdev->vf_token->lock);
1309 uuid_copy(&vdev->vf_token->uuid, &uuid);
1310 mutex_unlock(&vdev->vf_token->lock);
1311
1312 return 0;
1313 default:
1314 return -ENOTTY;
1315 }
1316 }
1317
1318 return -ENOTTY;
1319 }
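/*
 * Illustrative userspace sketch (editor's addition, not part of the
 * driver): how a user of this ioctl interface might query the device and
 * its BAR0 region.  It assumes "device_fd" was already obtained via
 * VFIO_GROUP_GET_DEVICE_FD and omits error handling; the argsz setup
 * mirrors the minsz checks above.
 */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stdio.h>

static void example_query_device(int device_fd)
{
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	struct vfio_region_info bar0 = {
		.argsz = sizeof(bar0),
		.index = VFIO_PCI_BAR0_REGION_INDEX,
	};

	/* Kernel fills in flags, num_regions and num_irqs */
	ioctl(device_fd, VFIO_DEVICE_GET_INFO, &dev_info);
	printf("%u regions, %u irqs\n", dev_info.num_regions, dev_info.num_irqs);

	/* Offset and size of BAR0 within the device file descriptor */
	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &bar0);
	printf("BAR0: size 0x%llx at offset 0x%llx, flags 0x%x\n",
	       (unsigned long long)bar0.size,
	       (unsigned long long)bar0.offset, bar0.flags);
}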
1320
1321 static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
1322 size_t count, loff_t *ppos, bool iswrite)
1323 {
1324 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1325 struct vfio_pci_device *vdev = device_data;
1326
1327 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1328 return -EINVAL;
1329
1330 switch (index) {
1331 case VFIO_PCI_CONFIG_REGION_INDEX:
1332 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1333
1334 case VFIO_PCI_ROM_REGION_INDEX:
1335 if (iswrite)
1336 return -EINVAL;
1337 return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
1338
1339 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1340 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
1341
1342 case VFIO_PCI_VGA_REGION_INDEX:
1343 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
1344 default:
1345 index -= VFIO_PCI_NUM_REGIONS;
1346 return vdev->region[index].ops->rw(vdev, buf,
1347 count, ppos, iswrite);
1348 }
1349
1350 return -EINVAL;
1351 }
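/*
 * Worked example (editor's addition): with the 40-bit
 * VFIO_PCI_OFFSET_SHIFT defined in vfio_pci_private.h, region index N
 * starts at file offset N << 40, so BAR0 (index 0) begins at offset 0
 * and config space (index 7) at 7ULL << 40.  Userspace normally just
 * uses the offset reported by VFIO_DEVICE_GET_REGION_INFO, e.g.
 *
 *	pread(device_fd, &vendor, 2, cfg_info.offset + 0x00);
 *
 * reads the 16-bit vendor ID through the config region.
 */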
1352
1353 static ssize_t vfio_pci_read(void *device_data, char __user *buf,
1354 size_t count, loff_t *ppos)
1355 {
1356 if (!count)
1357 return 0;
1358
1359 return vfio_pci_rw(device_data, buf, count, ppos, false);
1360 }
1361
1362 static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
1363 size_t count, loff_t *ppos)
1364 {
1365 if (!count)
1366 return 0;
1367
1368 return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
1369 }
1370
1371 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
1372 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try)
1373 {
1374 struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1375
1376 /*
1377 * Lock ordering:
1378 * vma_lock is nested under mmap_lock for vm_ops callback paths.
1379 * The memory_lock semaphore is used by both code paths calling
1380 * into this function to zap vmas and the vm_ops.fault callback
1381 * to protect the memory enable state of the device.
1382 *
1383 * When zapping vmas we need to maintain the mmap_lock => vma_lock
1384 * ordering, which requires using vma_lock to walk vma_list to
1385 * acquire an mm, then dropping vma_lock to get the mmap_lock and
1386 * reacquiring vma_lock. This logic is derived from similar
1387 * requirements in uverbs_user_mmap_disassociate().
1388 *
1389 * mmap_lock must always be the top-level lock when it is taken.
1390 * Therefore we can only hold the memory_lock write lock when
1391 * vma_list is empty, as we'd need to take mmap_lock to clear
1392 * entries. vma_list can only be guaranteed empty when holding
1393 * vma_lock, thus memory_lock is nested under vma_lock.
1394 *
1395 * This enables the vm_ops.fault callback to acquire vma_lock,
1396 * followed by memory_lock read lock, while already holding
1397 * mmap_lock without risk of deadlock.
1398 */
1399 while (1) {
1400 struct mm_struct *mm = NULL;
1401
1402 if (try) {
1403 if (!mutex_trylock(&vdev->vma_lock))
1404 return 0;
1405 } else {
1406 mutex_lock(&vdev->vma_lock);
1407 }
1408 while (!list_empty(&vdev->vma_list)) {
1409 mmap_vma = list_first_entry(&vdev->vma_list,
1410 struct vfio_pci_mmap_vma,
1411 vma_next);
1412 mm = mmap_vma->vma->vm_mm;
1413 if (mmget_not_zero(mm))
1414 break;
1415
1416 list_del(&mmap_vma->vma_next);
1417 kfree(mmap_vma);
1418 mm = NULL;
1419 }
1420 if (!mm)
1421 return 1;
1422 mutex_unlock(&vdev->vma_lock);
1423
1424 if (try) {
1425 if (!mmap_read_trylock(mm)) {
1426 mmput(mm);
1427 return 0;
1428 }
1429 } else {
1430 mmap_read_lock(mm);
1431 }
1432 if (mmget_still_valid(mm)) {
1433 if (try) {
1434 if (!mutex_trylock(&vdev->vma_lock)) {
1435 mmap_read_unlock(mm);
1436 mmput(mm);
1437 return 0;
1438 }
1439 } else {
1440 mutex_lock(&vdev->vma_lock);
1441 }
1442 list_for_each_entry_safe(mmap_vma, tmp,
1443 &vdev->vma_list, vma_next) {
1444 struct vm_area_struct *vma = mmap_vma->vma;
1445
1446 if (vma->vm_mm != mm)
1447 continue;
1448
1449 list_del(&mmap_vma->vma_next);
1450 kfree(mmap_vma);
1451
1452 zap_vma_ptes(vma, vma->vm_start,
1453 vma->vm_end - vma->vm_start);
1454 }
1455 mutex_unlock(&vdev->vma_lock);
1456 }
1457 mmap_read_unlock(mm);
1458 mmput(mm);
1459 }
1460 }
1461
1462 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev)
1463 {
1464 vfio_pci_zap_and_vma_lock(vdev, false);
1465 down_write(&vdev->memory_lock);
1466 mutex_unlock(&vdev->vma_lock);
1467 }
1468
1469 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev)
1470 {
1471 u16 cmd;
1472
1473 down_write(&vdev->memory_lock);
1474 pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1475 if (!(cmd & PCI_COMMAND_MEMORY))
1476 pci_write_config_word(vdev->pdev, PCI_COMMAND,
1477 cmd | PCI_COMMAND_MEMORY);
1478
1479 return cmd;
1480 }
1481
1482 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd)
1483 {
1484 pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1485 up_write(&vdev->memory_lock);
1486 }
1487
1488 /* Caller holds vma_lock */
1489 static int __vfio_pci_add_vma(struct vfio_pci_device *vdev,
1490 struct vm_area_struct *vma)
1491 {
1492 struct vfio_pci_mmap_vma *mmap_vma;
1493
1494 mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
1495 if (!mmap_vma)
1496 return -ENOMEM;
1497
1498 mmap_vma->vma = vma;
1499 list_add(&mmap_vma->vma_next, &vdev->vma_list);
1500
1501 return 0;
1502 }
1503
1504 /*
1505 * Zap mmaps on open so that we can fault them in on access and therefore
1506 * our vma_list only tracks mappings accessed since last zap.
1507 */
1508 static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1509 {
1510 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1511 }
1512
1513 static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1514 {
1515 struct vfio_pci_device *vdev = vma->vm_private_data;
1516 struct vfio_pci_mmap_vma *mmap_vma;
1517
1518 mutex_lock(&vdev->vma_lock);
1519 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1520 if (mmap_vma->vma == vma) {
1521 list_del(&mmap_vma->vma_next);
1522 kfree(mmap_vma);
1523 break;
1524 }
1525 }
1526 mutex_unlock(&vdev->vma_lock);
1527 }
1528
1529 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1530 {
1531 struct vm_area_struct *vma = vmf->vma;
1532 struct vfio_pci_device *vdev = vma->vm_private_data;
1533 vm_fault_t ret = VM_FAULT_NOPAGE;
1534
1535 mutex_lock(&vdev->vma_lock);
1536 down_read(&vdev->memory_lock);
1537
1538 if (!__vfio_pci_memory_enabled(vdev)) {
1539 ret = VM_FAULT_SIGBUS;
1540 mutex_unlock(&vdev->vma_lock);
1541 goto up_out;
1542 }
1543
1544 if (__vfio_pci_add_vma(vdev, vma)) {
1545 ret = VM_FAULT_OOM;
1546 mutex_unlock(&vdev->vma_lock);
1547 goto up_out;
1548 }
1549
1550 mutex_unlock(&vdev->vma_lock);
1551
1552 if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
1553 vma->vm_end - vma->vm_start, vma->vm_page_prot))
1554 ret = VM_FAULT_SIGBUS;
1555
1556 up_out:
1557 up_read(&vdev->memory_lock);
1558 return ret;
1559 }
1560
1561 static const struct vm_operations_struct vfio_pci_mmap_ops = {
1562 .open = vfio_pci_mmap_open,
1563 .close = vfio_pci_mmap_close,
1564 .fault = vfio_pci_mmap_fault,
1565 };
1566
1567 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
1568 {
1569 struct vfio_pci_device *vdev = device_data;
1570 struct pci_dev *pdev = vdev->pdev;
1571 unsigned int index;
1572 u64 phys_len, req_len, pgoff, req_start;
1573 int ret;
1574
1575 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1576
1577 if (vma->vm_end < vma->vm_start)
1578 return -EINVAL;
1579 if ((vma->vm_flags & VM_SHARED) == 0)
1580 return -EINVAL;
1581 if (index >= VFIO_PCI_NUM_REGIONS) {
1582 int regnum = index - VFIO_PCI_NUM_REGIONS;
1583 struct vfio_pci_region *region = vdev->region + regnum;
1584
1585 if (region && region->ops && region->ops->mmap &&
1586 (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1587 return region->ops->mmap(vdev, region, vma);
1588 return -EINVAL;
1589 }
1590 if (index >= VFIO_PCI_ROM_REGION_INDEX)
1591 return -EINVAL;
1592 if (!vdev->bar_mmap_supported[index])
1593 return -EINVAL;
1594
1595 phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
1596 req_len = vma->vm_end - vma->vm_start;
1597 pgoff = vma->vm_pgoff &
1598 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1599 req_start = pgoff << PAGE_SHIFT;
1600
1601 if (req_start + req_len > phys_len)
1602 return -EINVAL;
1603
1604 /*
1605 * Even though we don't make use of the barmap for the mmap,
1606 * we need to request the region and the barmap tracks that.
1607 */
1608 if (!vdev->barmap[index]) {
1609 ret = pci_request_selected_regions(pdev,
1610 1 << index, "vfio-pci");
1611 if (ret)
1612 return ret;
1613
1614 vdev->barmap[index] = pci_iomap(pdev, index, 0);
1615 if (!vdev->barmap[index]) {
1616 pci_release_selected_regions(pdev, 1 << index);
1617 return -ENOMEM;
1618 }
1619 }
1620
1621 vma->vm_private_data = vdev;
1622 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1623 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
1624
1625 /*
1626 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
1627 * change vm_flags within the fault handler. Set them now.
1628 */
1629 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1630 vma->vm_ops = &vfio_pci_mmap_ops;
1631
1632 return 0;
1633 }
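/*
 * Illustrative userspace sketch (editor's addition, not part of the
 * driver): mapping a BAR that reports VFIO_REGION_INFO_FLAG_MMAP.  The
 * mmap offset is the region offset returned by
 * VFIO_DEVICE_GET_REGION_INFO; "device_fd" is assumed to be an open
 * vfio device and error handling is omitted.
 */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stddef.h>

static void *example_map_bar0(int device_fd)
{
	struct vfio_region_info bar0 = {
		.argsz = sizeof(bar0),
		.index = VFIO_PCI_BAR0_REGION_INDEX,
	};

	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &bar0);
	if (!(bar0.flags & VFIO_REGION_INFO_FLAG_MMAP))
		return NULL;

	/* Faults on this mapping are satisfied by vfio_pci_mmap_fault() above */
	return mmap(NULL, bar0.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    device_fd, bar0.offset);
}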
1634
1635 static void vfio_pci_request(void *device_data, unsigned int count)
1636 {
1637 struct vfio_pci_device *vdev = device_data;
1638 struct pci_dev *pdev = vdev->pdev;
1639
1640 mutex_lock(&vdev->igate);
1641
1642 if (vdev->req_trigger) {
1643 if (!(count % 10))
1644 pci_notice_ratelimited(pdev,
1645 "Relaying device request to user (#%u)\n",
1646 count);
1647 eventfd_signal(vdev->req_trigger, 1);
1648 } else if (count == 0) {
1649 pci_warn(pdev,
1650 "No device request channel registered, blocked until released by user\n");
1651 }
1652
1653 mutex_unlock(&vdev->igate);
1654 }
1655
1656 static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
1657 bool vf_token, uuid_t *uuid)
1658 {
1659 /*
1660 * There's always some degree of trust or collaboration between SR-IOV
1661 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1662 * can disrupt VFs with a reset, but often the PF has more explicit
1663 * access to deny service to the VF or access data passed through the
1664 * VF. We therefore require an opt-in via a shared VF token (UUID) to
1665 * represent this trust. This both prevents a VF driver from assuming
1666 * the PF driver is a trusted, in-kernel driver, and prevents the PF
1667 * driver from being replaced with a rogue driver unknown to in-use
1668 * VF drivers.
1669 *
1670 * Therefore when presented with a VF, if the PF is a vfio device and
1671 * it is bound to the vfio-pci driver, the user needs to provide a VF
1672 * token to access the device, in the form of appending a vf_token to
1673 * the device name, for example:
1674 *
1675 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1676 *
1677 * When presented with a PF which has VFs in use, the user must also
1678 * provide the current VF token to prove collaboration with existing
1679 * VF users. If VFs are not in use, the VF token provided for the PF
1680 * device will act to set the VF token.
1681 *
1682 * If the VF token is provided but unused, an error is generated.
1683 */
1684 if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
1685 return 0; /* No VF token provided or required */
1686
1687 if (vdev->pdev->is_virtfn) {
1688 struct vfio_device *pf_dev;
1689 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
1690 bool match;
1691
1692 if (!pf_vdev) {
1693 if (!vf_token)
1694 return 0; /* PF is not vfio-pci, no VF token */
1695
1696 pci_info_ratelimited(vdev->pdev,
1697 "VF token incorrectly provided, PF not bound to vfio-pci\n");
1698 return -EINVAL;
1699 }
1700
1701 if (!vf_token) {
1702 vfio_device_put(pf_dev);
1703 pci_info_ratelimited(vdev->pdev,
1704 "VF token required to access device\n");
1705 return -EACCES;
1706 }
1707
1708 mutex_lock(&pf_vdev->vf_token->lock);
1709 match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1710 mutex_unlock(&pf_vdev->vf_token->lock);
1711
1712 vfio_device_put(pf_dev);
1713
1714 if (!match) {
1715 pci_info_ratelimited(vdev->pdev,
1716 "Incorrect VF token provided for device\n");
1717 return -EACCES;
1718 }
1719 } else if (vdev->vf_token) {
1720 mutex_lock(&vdev->vf_token->lock);
1721 if (vdev->vf_token->users) {
1722 if (!vf_token) {
1723 mutex_unlock(&vdev->vf_token->lock);
1724 pci_info_ratelimited(vdev->pdev,
1725 "VF token required to access device\n");
1726 return -EACCES;
1727 }
1728
1729 if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1730 mutex_unlock(&vdev->vf_token->lock);
1731 pci_info_ratelimited(vdev->pdev,
1732 "Incorrect VF token provided for device\n");
1733 return -EACCES;
1734 }
1735 } else if (vf_token) {
1736 uuid_copy(&vdev->vf_token->uuid, uuid);
1737 }
1738
1739 mutex_unlock(&vdev->vf_token->lock);
1740 } else if (vf_token) {
1741 pci_info_ratelimited(vdev->pdev,
1742 "VF token incorrectly provided, not a PF or VF\n");
1743 return -EINVAL;
1744 }
1745
1746 return 0;
1747 }
1748
1749 #define VF_TOKEN_ARG "vf_token="
1750
1751 static int vfio_pci_match(void *device_data, char *buf)
1752 {
1753 struct vfio_pci_device *vdev = device_data;
1754 bool vf_token = false;
1755 uuid_t uuid;
1756 int ret;
1757
1758 if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
1759 return 0; /* No match */
1760
1761 if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
1762 buf += strlen(pci_name(vdev->pdev));
1763
1764 if (*buf != ' ')
1765 return 0; /* No match: non-whitespace after name */
1766
1767 while (*buf) {
1768 if (*buf == ' ') {
1769 buf++;
1770 continue;
1771 }
1772
1773 if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
1774 strlen(VF_TOKEN_ARG))) {
1775 buf += strlen(VF_TOKEN_ARG);
1776
1777 if (strlen(buf) < UUID_STRING_LEN)
1778 return -EINVAL;
1779
1780 ret = uuid_parse(buf, &uuid);
1781 if (ret)
1782 return ret;
1783
1784 vf_token = true;
1785 buf += UUID_STRING_LEN;
1786 } else {
1787 /* Unknown/duplicate option */
1788 return -EINVAL;
1789 }
1790 }
1791 }
1792
1793 ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
1794 if (ret)
1795 return ret;
1796
1797 return 1; /* Match */
1798 }
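/*
 * Usage example (editor's addition): the match string is the device name
 * passed to VFIO_GROUP_GET_DEVICE_FD, optionally followed by the VF
 * token, reusing the example address and token from the comment in
 * vfio_pci_validate_vf_token() above:
 *
 *	ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD,
 *	      "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3");
 */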
1799
1800 static const struct vfio_device_ops vfio_pci_ops = {
1801 .name = "vfio-pci",
1802 .open = vfio_pci_open,
1803 .release = vfio_pci_release,
1804 .ioctl = vfio_pci_ioctl,
1805 .read = vfio_pci_read,
1806 .write = vfio_pci_write,
1807 .mmap = vfio_pci_mmap,
1808 .request = vfio_pci_request,
1809 .match = vfio_pci_match,
1810 };
1811
1812 static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
1813 static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
1814 static struct pci_driver vfio_pci_driver;
1815
1816 static int vfio_pci_bus_notifier(struct notifier_block *nb,
1817 unsigned long action, void *data)
1818 {
1819 struct vfio_pci_device *vdev = container_of(nb,
1820 struct vfio_pci_device, nb);
1821 struct device *dev = data;
1822 struct pci_dev *pdev = to_pci_dev(dev);
1823 struct pci_dev *physfn = pci_physfn(pdev);
1824
1825 if (action == BUS_NOTIFY_ADD_DEVICE &&
1826 pdev->is_virtfn && physfn == vdev->pdev) {
1827 pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
1828 pci_name(pdev));
1829 pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
1830 vfio_pci_ops.name);
1831 } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
1832 pdev->is_virtfn && physfn == vdev->pdev) {
1833 struct pci_driver *drv = pci_dev_driver(pdev);
1834
1835 if (drv && drv != &vfio_pci_driver)
1836 pci_warn(vdev->pdev,
1837 "VF %s bound to driver %s while PF bound to vfio-pci\n",
1838 pci_name(pdev), drv->name);
1839 }
1840
1841 return 0;
1842 }
1843
1844 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
1845 {
1846 struct vfio_pci_device *vdev;
1847 struct iommu_group *group;
1848 int ret;
1849
1850 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1851 return -EINVAL;
1852
1853 /*
1854 * Prevent binding to PFs with VFs enabled; the VFs might be in use
1855 * by the host or other users. We cannot capture the VFs if they
1856 * already exist, nor can we track VF users. Disabling SR-IOV here
1857 * would initiate removing the VFs, which would unbind the driver,
1858 * which is prone to blocking if that VF is also in use by vfio-pci.
1859 * Just reject these PFs and let the user sort it out.
1860 */
1861 if (pci_num_vf(pdev)) {
1862 pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
1863 return -EBUSY;
1864 }
1865
1866 group = vfio_iommu_group_get(&pdev->dev);
1867 if (!group)
1868 return -EINVAL;
1869
1870 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
1871 if (!vdev) {
1872 ret = -ENOMEM;
1873 goto out_group_put;
1874 }
1875
1876 vdev->pdev = pdev;
1877 vdev->irq_type = VFIO_PCI_NUM_IRQS;
1878 mutex_init(&vdev->igate);
1879 spin_lock_init(&vdev->irqlock);
1880 mutex_init(&vdev->ioeventfds_lock);
1881 INIT_LIST_HEAD(&vdev->ioeventfds_list);
1882 mutex_init(&vdev->vma_lock);
1883 INIT_LIST_HEAD(&vdev->vma_list);
1884 init_rwsem(&vdev->memory_lock);
1885
1886 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
1887 if (ret)
1888 goto out_free;
1889
1890 ret = vfio_pci_reflck_attach(vdev);
1891 if (ret)
1892 goto out_del_group_dev;
1893
1894 if (pdev->is_physfn) {
1895 vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
1896 if (!vdev->vf_token) {
1897 ret = -ENOMEM;
1898 goto out_reflck;
1899 }
1900
1901 mutex_init(&vdev->vf_token->lock);
1902 uuid_gen(&vdev->vf_token->uuid);
1903
1904 vdev->nb.notifier_call = vfio_pci_bus_notifier;
1905 ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
1906 if (ret)
1907 goto out_vf_token;
1908 }
1909
1910 if (vfio_pci_is_vga(pdev)) {
1911 vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
1912 vga_set_legacy_decoding(pdev,
1913 vfio_pci_set_vga_decode(vdev, false));
1914 }
1915
1916 vfio_pci_probe_power_state(vdev);
1917
1918 if (!disable_idle_d3) {
1919 /*
1920 * pci-core sets the device power state to an unknown value at
1921 * bootup and after being removed from a driver. The only
1922 * transition it allows from this unknown state is to D0, which
1923 * typically happens when a driver calls pci_enable_device().
1924 * We're not ready to enable the device yet, but we do want to
1925 * be able to get to D3. Therefore first do a D0 transition
1926 * before going to D3.
1927 */
1928 vfio_pci_set_power_state(vdev, PCI_D0);
1929 vfio_pci_set_power_state(vdev, PCI_D3hot);
1930 }
1931
1932 return ret;
1933
1934 out_vf_token:
1935 kfree(vdev->vf_token);
1936 out_reflck:
1937 vfio_pci_reflck_put(vdev->reflck);
1938 out_del_group_dev:
1939 vfio_del_group_dev(&pdev->dev);
1940 out_free:
1941 kfree(vdev);
1942 out_group_put:
1943 vfio_iommu_group_put(group, &pdev->dev);
1944 return ret;
1945 }
1946
1947 static void vfio_pci_remove(struct pci_dev *pdev)
1948 {
1949 struct vfio_pci_device *vdev;
1950
1951 pci_disable_sriov(pdev);
1952
1953 vdev = vfio_del_group_dev(&pdev->dev);
1954 if (!vdev)
1955 return;
1956
1957 if (vdev->vf_token) {
1958 WARN_ON(vdev->vf_token->users);
1959 mutex_destroy(&vdev->vf_token->lock);
1960 kfree(vdev->vf_token);
1961 }
1962
1963 if (vdev->nb.notifier_call)
1964 bus_unregister_notifier(&pci_bus_type, &vdev->nb);
1965
1966 vfio_pci_reflck_put(vdev->reflck);
1967
1968 vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
1969 kfree(vdev->region);
1970 mutex_destroy(&vdev->ioeventfds_lock);
1971
1972 if (!disable_idle_d3)
1973 vfio_pci_set_power_state(vdev, PCI_D0);
1974
1975 kfree(vdev->pm_save);
1976 kfree(vdev);
1977
1978 if (vfio_pci_is_vga(pdev)) {
1979 vga_client_register(pdev, NULL, NULL, NULL);
1980 vga_set_legacy_decoding(pdev,
1981 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
1982 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
1983 }
1984 }
1985
1986 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
1987 pci_channel_state_t state)
1988 {
1989 struct vfio_pci_device *vdev;
1990 struct vfio_device *device;
1991
1992 device = vfio_device_get_from_dev(&pdev->dev);
1993 if (device == NULL)
1994 return PCI_ERS_RESULT_DISCONNECT;
1995
1996 vdev = vfio_device_data(device);
1997 if (vdev == NULL) {
1998 vfio_device_put(device);
1999 return PCI_ERS_RESULT_DISCONNECT;
2000 }
2001
2002 mutex_lock(&vdev->igate);
2003
2004 if (vdev->err_trigger)
2005 eventfd_signal(vdev->err_trigger, 1);
2006
2007 mutex_unlock(&vdev->igate);
2008
2009 vfio_device_put(device);
2010
2011 return PCI_ERS_RESULT_CAN_RECOVER;
2012 }
2013
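/*
 * sriov_configure callback, reached when the admin writes the PF's
 * sriov_numvfs sysfs attribute.  It is honored only when the
 * enable_sriov module parameter is set; 0 disables VFs, any other value
 * enables that many.  Illustrative usage (example device address):
 *
 *   echo 4 > /sys/bus/pci/devices/0000:01:00.0/sriov_numvfs
 */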
2014 static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
2015 {
2016 struct vfio_pci_device *vdev;
2017 struct vfio_device *device;
2018 int ret = 0;
2019
2020 might_sleep();
2021
2022 if (!enable_sriov)
2023 return -ENOENT;
2024
2025 device = vfio_device_get_from_dev(&pdev->dev);
2026 if (!device)
2027 return -ENODEV;
2028
2029 vdev = vfio_device_data(device);
2030 if (!vdev) {
2031 vfio_device_put(device);
2032 return -ENODEV;
2033 }
2034
2035 if (nr_virtfn == 0)
2036 pci_disable_sriov(pdev);
2037 else
2038 ret = pci_enable_sriov(pdev, nr_virtfn);
2039
2040 vfio_device_put(device);
2041
2042 return ret < 0 ? ret : nr_virtfn;
2043 }
2044
2045 static const struct pci_error_handlers vfio_err_handlers = {
2046 .error_detected = vfio_pci_aer_err_detected,
2047 };
2048
2049 static struct pci_driver vfio_pci_driver = {
2050 .name = "vfio-pci",
2051 .id_table = NULL, /* only dynamic ids */
2052 .probe = vfio_pci_probe,
2053 .remove = vfio_pci_remove,
2054 .sriov_configure = vfio_pci_sriov_configure,
2055 .err_handler = &vfio_err_handlers,
2056 };
2057
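/*
 * A "reflck" is a kref-counted mutex shared by every vfio-pci device in
 * the same bus or slot reset group; holding reflck->lock protects the
 * group from concurrent opens while a bus/slot reset is attempted.
 * reflck_lock below protects lookup and teardown of the shared object
 * itself.
 */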
2058 static DEFINE_MUTEX(reflck_lock);
2059
2060 static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
2061 {
2062 struct vfio_pci_reflck *reflck;
2063
2064 reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
2065 if (!reflck)
2066 return ERR_PTR(-ENOMEM);
2067
2068 kref_init(&reflck->kref);
2069 mutex_init(&reflck->lock);
2070
2071 return reflck;
2072 }
2073
2074 static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
2075 {
2076 kref_get(&reflck->kref);
2077 }
2078
2079 static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
2080 {
2081 struct vfio_pci_reflck **preflck = data;
2082 struct vfio_device *device;
2083 struct vfio_pci_device *vdev;
2084
2085 device = vfio_device_get_from_dev(&pdev->dev);
2086 if (!device)
2087 return 0;
2088
2089 if (pci_dev_driver(pdev) != &vfio_pci_driver) {
2090 vfio_device_put(device);
2091 return 0;
2092 }
2093
2094 vdev = vfio_device_data(device);
2095
2096 if (vdev->reflck) {
2097 vfio_pci_reflck_get(vdev->reflck);
2098 *preflck = vdev->reflck;
2099 vfio_device_put(device);
2100 return 1;
2101 }
2102
2103 vfio_device_put(device);
2104 return 0;
2105 }
2106
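/*
 * Attach a reflck to a newly probed device: look for another vfio-pci
 * device in the same slot/bus reset group that already owns one and
 * share it; otherwise (or on a root bus) allocate a fresh reflck.
 */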
2107 static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
2108 {
2109 bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
2110
2111 mutex_lock(&reflck_lock);
2112
2113 if (pci_is_root_bus(vdev->pdev->bus) ||
2114 vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
2115 &vdev->reflck, slot) <= 0)
2116 vdev->reflck = vfio_pci_reflck_alloc();
2117
2118 mutex_unlock(&reflck_lock);
2119
2120 return PTR_ERR_OR_ZERO(vdev->reflck);
2121 }
2122
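/*
 * kref release callback, invoked by kref_put_mutex() with reflck_lock
 * held; free the object and drop the lock on the way out.
 */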
2123 static void vfio_pci_reflck_release(struct kref *kref)
2124 {
2125 struct vfio_pci_reflck *reflck = container_of(kref,
2126 struct vfio_pci_reflck,
2127 kref);
2128
2129 kfree(reflck);
2130 mutex_unlock(&reflck_lock);
2131 }
2132
2133 static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
2134 {
2135 kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
2136 }
2137
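/*
 * vfio_pci_for_each_slot_or_bus() callback: collect a vfio_device
 * reference for each device in the reset group, failing if any device
 * is bound to another driver or is currently opened by a user.
 */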
2138 static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
2139 {
2140 struct vfio_devices *devs = data;
2141 struct vfio_device *device;
2142 struct vfio_pci_device *vdev;
2143
2144 if (devs->cur_index == devs->max_index)
2145 return -ENOSPC;
2146
2147 device = vfio_device_get_from_dev(&pdev->dev);
2148 if (!device)
2149 return -EINVAL;
2150
2151 if (pci_dev_driver(pdev) != &vfio_pci_driver) {
2152 vfio_device_put(device);
2153 return -EBUSY;
2154 }
2155
2156 vdev = vfio_device_data(device);
2157
2158 /* Fail if the device is currently in use */
2159 if (vdev->refcnt) {
2160 vfio_device_put(device);
2161 return -EBUSY;
2162 }
2163
2164 devs->devices[devs->cur_index++] = device;
2165 return 0;
2166 }
2167
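/*
 * Like vfio_pci_get_unused_devs(), but rather than requiring devices to
 * be unused it try-locks each device's vma_lock and zaps its mmaps so
 * user access to device memory is blocked across the reset.
 */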
2168 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
2169 {
2170 struct vfio_devices *devs = data;
2171 struct vfio_device *device;
2172 struct vfio_pci_device *vdev;
2173
2174 if (devs->cur_index == devs->max_index)
2175 return -ENOSPC;
2176
2177 device = vfio_device_get_from_dev(&pdev->dev);
2178 if (!device)
2179 return -EINVAL;
2180
2181 if (pci_dev_driver(pdev) != &vfio_pci_driver) {
2182 vfio_device_put(device);
2183 return -EBUSY;
2184 }
2185
2186 vdev = vfio_device_data(device);
2187
2188 /*
2189 * Locking multiple devices is prone to deadlock; if we hit
2190 * contention, run away and unwind rather than waiting.
2191 */
2192 if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
2193 vfio_device_put(device);
2194 return -EBUSY;
2195 }
2196
2197 devs->devices[devs->cur_index++] = device;
2198 return 0;
2199 }
2200
2201 /*
2202 * If a bus or slot reset is available for the provided device and:
2203 * - All of the devices affected by that bus or slot reset are unused
2204 * (!refcnt)
2205 * - At least one of the affected devices is marked dirty via
2206 * needs_reset (such as by lack of FLR support)
2207 * Then attempt to perform that bus or slot reset. Callers are required
2208 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
2209 * concurrent opens. A vfio_device reference is acquired for each device
2210 * to prevent unbinds during the reset operation.
2211 *
2212 * NB: vfio-core considers a group to be viable even if some devices are
2213 * bound to drivers like pci-stub or pcieport. Here we require all devices
2214 * to be bound to vfio_pci since that's the only way we can be sure they
2215 * stay put.
2216 */
2217 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
2218 {
2219 struct vfio_devices devs = { .cur_index = 0 };
2220 int i = 0, ret = -EINVAL;
2221 bool slot = false;
2222 struct vfio_pci_device *tmp;
2223
2224 if (!pci_probe_reset_slot(vdev->pdev->slot))
2225 slot = true;
2226 else if (pci_probe_reset_bus(vdev->pdev->bus))
2227 return;
2228
2229 if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
2230 &i, slot) || !i)
2231 return;
2232
2233 devs.max_index = i;
2234 devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
2235 if (!devs.devices)
2236 return;
2237
2238 if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
2239 vfio_pci_get_unused_devs,
2240 &devs, slot))
2241 goto put_devs;
2242
2243 /* Does at least one need a reset? */
2244 for (i = 0; i < devs.cur_index; i++) {
2245 tmp = vfio_device_data(devs.devices[i]);
2246 if (tmp->needs_reset) {
2247 ret = pci_reset_bus(vdev->pdev);
2248 break;
2249 }
2250 }
2251
2252 put_devs:
2253 for (i = 0; i < devs.cur_index; i++) {
2254 tmp = vfio_device_data(devs.devices[i]);
2255
2256 /*
2257 * If reset was successful, affected devices no longer need
2258 * a reset and we should return all the collateral devices
2259 * to low power. If not successful, we either didn't reset
2260 * the bus or timed out waiting for it, so let's not touch
2261 * the power state.
2262 */
2263 if (!ret) {
2264 tmp->needs_reset = false;
2265
2266 if (tmp != vdev && !disable_idle_d3)
2267 vfio_pci_set_power_state(tmp, PCI_D3hot);
2268 }
2269
2270 vfio_device_put(devs.devices[i]);
2271 }
2272
2273 kfree(devs.devices);
2274 }
2275
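/* Module exit: unregister the driver and free the shared permission bits. */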
2276 static void __exit vfio_pci_cleanup(void)
2277 {
2278 pci_unregister_driver(&vfio_pci_driver);
2279 vfio_pci_uninit_perm_bits();
2280 }
2281
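/*
 * Parse the "ids" module parameter and add each entry as a dynamic PCI
 * ID for this driver.  An entry is up to six colon-separated hex fields,
 * vendor:device[:subvendor[:subdevice[:class[:class_mask]]]], with only
 * vendor and device required.  Illustrative usage (example IDs):
 *
 *   modprobe vfio-pci ids=10de:1c02,8086:10fb
 */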
2282 static void __init vfio_pci_fill_ids(void)
2283 {
2284 char *p, *id;
2285 int rc;
2286
2287 /* no IDs were given via the module parameter */
2288 if (ids[0] == '\0')
2289 return;
2290
2291 /* add ids specified in the module parameter */
2292 p = ids;
2293 while ((id = strsep(&p, ","))) {
2294 unsigned int vendor, device, subvendor = PCI_ANY_ID,
2295 subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
2296 int fields;
2297
2298 if (!strlen(id))
2299 continue;
2300
2301 fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
2302 &vendor, &device, &subvendor, &subdevice,
2303 &class, &class_mask);
2304
2305 if (fields < 2) {
2306 pr_warn("invalid id string \"%s\"\n", id);
2307 continue;
2308 }
2309
2310 rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
2311 subvendor, subdevice, class, class_mask, 0);
2312 if (rc)
2313 pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
2314 vendor, device, subvendor, subdevice,
2315 class, class_mask, rc);
2316 else
2317 pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
2318 vendor, device, subvendor, subdevice,
2319 class, class_mask);
2320 }
2321 }
2322
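/*
 * Module init: build the shared config space permission bits, register
 * the driver (no static ID table, dynamic IDs only), then add any IDs
 * supplied through the "ids" parameter so matching devices are claimed
 * immediately.
 */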
2323 static int __init vfio_pci_init(void)
2324 {
2325 int ret;
2326
2327 /* Allocate shared config space permission data used by all devices */
2328 ret = vfio_pci_init_perm_bits();
2329 if (ret)
2330 return ret;
2331
2332 /* Register and scan for devices */
2333 ret = pci_register_driver(&vfio_pci_driver);
2334 if (ret)
2335 goto out_driver;
2336
2337 vfio_pci_fill_ids();
2338
2339 return 0;
2340
2341 out_driver:
2342 vfio_pci_uninit_perm_bits();
2343 return ret;
2344 }
2345
2346 module_init(vfio_pci_init);
2347 module_exit(vfio_pci_cleanup);
2348
2349 MODULE_VERSION(DRIVER_VERSION);
2350 MODULE_LICENSE("GPL v2");
2351 MODULE_AUTHOR(DRIVER_AUTHOR);
2352 MODULE_DESCRIPTION(DRIVER_DESC);