git.ipfire.org Git - thirdparty/kernel/stable.git/blob

3 *

4 * Author: Michael S. Tsirkin <mst@redhat.com>

5 *

6 * Inspiration, some code, and most witty comments come from

7 * Documentation/virtual/lguest/lguest.c, by Rusty Russell

8 *

9 * This work is licensed under the terms of the GNU GPL, version 2.

10 *

11 * Generic code for virtio server in host kernel.

12 */

14 #include <linux/eventfd.h>

15 #include <linux/vhost.h>

16 #include <linux/uio.h>

17 #include <linux/mm.h>

18 #include <linux/mmu_context.h>

19 #include <linux/miscdevice.h>

20 #include <linux/mutex.h>

21 #include <linux/poll.h>

22 #include <linux/file.h>

23 #include <linux/highmem.h>

24 #include <linux/slab.h>

25 #include <linux/vmalloc.h>

26 #include <linux/kthread.h>

27 #include <linux/cgroup.h>

28 #include <linux/module.h>

29 #include <linux/sort.h>

30 #include <linux/sched/mm.h>

31 #include <linux/sched/signal.h>

32 #include <linux/interval_tree_generic.h>

34 #include "vhost.h"

36 static ushort max_mem_regions = 64;

37 module_param(max_mem_regions, ushort, 0444);

38 MODULE_PARM_DESC(max_mem_regions,

39 "Maximum number of memory regions in memory map. (default: 64)");

40 static int max_iotlb_entries = 2048;

41 module_param(max_iotlb_entries, int, 0444);

42 MODULE_PARM_DESC(max_iotlb_entries,

43 "Maximum number of iotlb entries. (default: 2048)");

/* Flag bit(s) private to this file. */
enum {
	VHOST_MEMORY_F_LOG = (1 << 0),
};

/*
 * Address of the 16-bit event word stored directly after the last ring
 * entry (index vq->num) of the avail and used rings respectively.
 * The macro argument is parenthesized so that any pointer expression
 * (not just a plain identifier) can be passed safely.
 */
#define vhost_used_event(vq) \
	((__virtio16 __user *)&(vq)->avail->ring[(vq)->num])
#define vhost_avail_event(vq) \
	((__virtio16 __user *)&(vq)->used->ring[(vq)->num])

52 INTERVAL_TREE_DEFINE(struct vhost_umem_node,

53 rb, __u64, __subtree_last,

54 START, LAST, static inline, vhost_umem_interval_tree);

56 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY

57 static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)

58 {

59 vq->user_be = !virtio_legacy_is_little_endian();

60 }

62 static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)

63 {

64 vq->user_be = true;

65 }

67 static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)

68 {

69 vq->user_be = false;

70 }

72 static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)

73 {

74 struct vhost_vring_state s;

76 if (vq->private_data)

77 return -EBUSY;

79 if (copy_from_user(&s, argp, sizeof(s)))

80 return -EFAULT;

82 if (s.num != VHOST_VRING_LITTLE_ENDIAN &&

83 s.num != VHOST_VRING_BIG_ENDIAN)

84 return -EINVAL;

86 if (s.num == VHOST_VRING_BIG_ENDIAN)

87 vhost_enable_cross_endian_big(vq);

88 else

89 vhost_enable_cross_endian_little(vq);

91 return 0;

92 }

94 static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,

95 int __user *argp)

96 {

97 struct vhost_vring_state s = {

98 .index = idx,

99 .num = vq->user_be

100 };

101

102 if (copy_to_user(argp, &s, sizeof(s)))

103 return -EFAULT;

104

105 return 0;

106 }

107

108 static void vhost_init_is_le(struct vhost_virtqueue *vq)

109 {

110 /* Note for legacy virtio: user_be is initialized at reset time

111 * according to the host endianness. If userspace does not set an

112 * explicit endianness, the default behavior is native endian, as

113 * expected by legacy virtio.

114 */

115 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;

116 }

117 #else

/* Cross-endian legacy support is compiled out: nothing to reset. */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}

121

122 static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)

123 {

124 return -ENOIOCTLCMD;

125 }

126

127 static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,

128 int __user *argp)

129 {

130 return -ENOIOCTLCMD;

131 }

132

133 static void vhost_init_is_le(struct vhost_virtqueue *vq)

134 {

135 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)

136 || virtio_legacy_is_little_endian();

137 }

138 #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */

139

/* Recompute vq->is_le from the ring's current features; used on vq reset. */
static void vhost_reset_is_le(struct vhost_virtqueue *vq)
{
	vhost_init_is_le(vq);
}

144

145 struct vhost_flush_struct {

146 struct vhost_work work;

147 struct completion wait_event;

148 };

149

150 static void vhost_flush_work(struct vhost_work *work)

151 {

152 struct vhost_flush_struct *s;

153

154 s = container_of(work, struct vhost_flush_struct, work);

155 complete(&s->wait_event);

156 }

157

158 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,

159 poll_table *pt)

160 {

161 struct vhost_poll *poll;

162

163 poll = container_of(pt, struct vhost_poll, table);

164 poll->wqh = wqh;

165 add_wait_queue(wqh, &poll->wait);

166 }

167

168 static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,

169 void *key)

170 {

171 struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

172

173 if (!((unsigned long)key & poll->mask))

174 return 0;

175

176 vhost_poll_queue(poll);

177 return 0;

178 }

179

180 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)

181 {

182 clear_bit(VHOST_WORK_QUEUED, &work->flags);

183 work->fn = fn;

184 init_waitqueue_head(&work->done);

185 }

186 EXPORT_SYMBOL_GPL(vhost_work_init);

187

188 /* Init poll structure */

189 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,

190 unsigned long mask, struct vhost_dev *dev)

191 {

192 init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);

193 init_poll_funcptr(&poll->table, vhost_poll_func);

194 poll->mask = mask;

195 poll->dev = dev;

196 poll->wqh = NULL;

197

198 vhost_work_init(&poll->work, fn);

199 }

200 EXPORT_SYMBOL_GPL(vhost_poll_init);

201

202 /* Start polling a file. We add ourselves to file's wait queue. The caller must

203 * keep a reference to a file until after vhost_poll_stop is called. */

204 int vhost_poll_start(struct vhost_poll *poll, struct file *file)

205 {

206 unsigned long mask;

207 int ret = 0;

208

209 if (poll->wqh)

210 return 0;

211

212 mask = file->f_op->poll(file, &poll->table);

213 if (mask)

214 vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);

215 if (mask & POLLERR) {

216 vhost_poll_stop(poll);

217 ret = -EINVAL;

218 }

219

220 return ret;

221 }

222 EXPORT_SYMBOL_GPL(vhost_poll_start);

223

224 /* Stop polling a file. After this function returns, it becomes safe to drop the

225 * file reference. You must also flush afterwards. */

226 void vhost_poll_stop(struct vhost_poll *poll)

227 {

228 if (poll->wqh) {

229 remove_wait_queue(poll->wqh, &poll->wait);

230 poll->wqh = NULL;

231 }

232 }

233 EXPORT_SYMBOL_GPL(vhost_poll_stop);

234

235 void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)

236 {

237 struct vhost_flush_struct flush;

238

239 if (dev->worker) {

240 init_completion(&flush.wait_event);

241 vhost_work_init(&flush.work, vhost_flush_work);

242

243 vhost_work_queue(dev, &flush.work);

244 wait_for_completion(&flush.wait_event);

245 }

246 }

247 EXPORT_SYMBOL_GPL(vhost_work_flush);

248

249 /* Flush any work that has been scheduled. When calling this, don't hold any

250 * locks that are also used by the callback. */

251 void vhost_poll_flush(struct vhost_poll *poll)

252 {

253 vhost_work_flush(poll->dev, &poll->work);

254 }

255 EXPORT_SYMBOL_GPL(vhost_poll_flush);

256

257 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)

258 {

259 if (!dev->worker)

260 return;

261

262 if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {

263 /* We can only add the work to the list after we're

264 * sure it was not in the list.

265 * test_and_set_bit() implies a memory barrier.

266 */

267 llist_add(&work->node, &dev->work_list);

268 wake_up_process(dev->worker);

269 }

270 }

271 EXPORT_SYMBOL_GPL(vhost_work_queue);

272

273 /* A lockless hint for busy polling code to exit the loop */

274 bool vhost_has_work(struct vhost_dev *dev)

275 {

276 return !llist_empty(&dev->work_list);

277 }

278 EXPORT_SYMBOL_GPL(vhost_has_work);

279

280 void vhost_poll_queue(struct vhost_poll *poll)

281 {

282 vhost_work_queue(poll->dev, &poll->work);

283 }

284 EXPORT_SYMBOL_GPL(vhost_poll_queue);

285

286 static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)

287 {

288 int j;

289

290 for (j = 0; j < VHOST_NUM_ADDRS; j++)

291 vq->meta_iotlb[j] = NULL;

292 }

293

294 static void vhost_vq_meta_reset(struct vhost_dev *d)

295 {

296 int i;

297

298 for (i = 0; i < d->nvqs; ++i)

299 __vhost_vq_meta_reset(d->vqs[i]);

300 }

301

302 static void vhost_vq_reset(struct vhost_dev *dev,

303 struct vhost_virtqueue *vq)

304 {

305 vq->num = 1;

306 vq->desc = NULL;

307 vq->avail = NULL;

308 vq->used = NULL;

309 vq->last_avail_idx = 0;

310 vq->avail_idx = 0;

311 vq->last_used_idx = 0;

312 vq->signalled_used = 0;

313 vq->signalled_used_valid = false;

314 vq->used_flags = 0;

315 vq->log_used = false;

316 vq->log_addr = -1ull;

317 vq->private_data = NULL;

318 vq->acked_features = 0;

319 vq->log_base = NULL;

320 vq->error_ctx = NULL;

321 vq->error = NULL;

322 vq->kick = NULL;

323 vq->call_ctx = NULL;

324 vq->call = NULL;

325 vq->log_ctx = NULL;

326 vhost_reset_is_le(vq);

327 vhost_disable_cross_endian(vq);

328 vq->busyloop_timeout = 0;

329 vq->umem = NULL;

330 vq->iotlb = NULL;

331 __vhost_vq_meta_reset(vq);

332 }

333

334 static int vhost_worker(void *data)

335 {

336 struct vhost_dev *dev = data;

337 struct vhost_work *work, *work_next;

338 struct llist_node *node;

339 mm_segment_t oldfs = get_fs();

340

341 set_fs(USER_DS);

342 use_mm(dev->mm);

343

344 for (;;) {

345 /* mb paired w/ kthread_stop */

346 set_current_state(TASK_INTERRUPTIBLE);

347

348 if (kthread_should_stop()) {

349 __set_current_state(TASK_RUNNING);

350 break;

351 }

352

353 node = llist_del_all(&dev->work_list);

354 if (!node)

355 schedule();

356

357 node = llist_reverse_order(node);

358 /* make sure flag is seen after deletion */

359 smp_wmb();

360 llist_for_each_entry_safe(work, work_next, node, node) {

361 clear_bit(VHOST_WORK_QUEUED, &work->flags);

362 __set_current_state(TASK_RUNNING);

363 work->fn(work);

364 if (need_resched())

365 schedule();

366 }

367 }

368 unuse_mm(dev->mm);

369 set_fs(oldfs);

370 return 0;

371 }

372

373 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)

374 {

375 kfree(vq->indirect);

376 vq->indirect = NULL;

377 kfree(vq->log);

378 vq->log = NULL;

379 kfree(vq->heads);

380 vq->heads = NULL;

381 }

382

383 /* Helper to allocate iovec buffers for all vqs. */

384 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)

385 {

386 struct vhost_virtqueue *vq;

387 int i;

388

389 for (i = 0; i < dev->nvqs; ++i) {

390 vq = dev->vqs[i];

391 vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,

392 GFP_KERNEL);

393 vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);

394 vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);

395 if (!vq->indirect || !vq->log || !vq->heads)

396 goto err_nomem;

397 }

398 return 0;

399

400 err_nomem:

401 for (; i >= 0; --i)

402 vhost_vq_free_iovecs(dev->vqs[i]);

403 return -ENOMEM;

404 }

405

406 static void vhost_dev_free_iovecs(struct vhost_dev *dev)

407 {

408 int i;

409

410 for (i = 0; i < dev->nvqs; ++i)

411 vhost_vq_free_iovecs(dev->vqs[i]);

412 }

413

414 void vhost_dev_init(struct vhost_dev *dev,

415 struct vhost_virtqueue **vqs, int nvqs)

416 {

417 struct vhost_virtqueue *vq;

418 int i;

419

420 dev->vqs = vqs;

421 dev->nvqs = nvqs;

422 mutex_init(&dev->mutex);

423 dev->log_ctx = NULL;

424 dev->log_file = NULL;

425 dev->umem = NULL;

426 dev->iotlb = NULL;

427 dev->mm = NULL;

428 dev->worker = NULL;

429 init_llist_head(&dev->work_list);

430 init_waitqueue_head(&dev->wait);

431 INIT_LIST_HEAD(&dev->read_list);

432 INIT_LIST_HEAD(&dev->pending_list);

433 spin_lock_init(&dev->iotlb_lock);

434

435

436 for (i = 0; i < dev->nvqs; ++i) {

437 vq = dev->vqs[i];

438 vq->log = NULL;

439 vq->indirect = NULL;

440 vq->heads = NULL;

441 vq->dev = dev;

442 mutex_init(&vq->mutex);

443 vhost_vq_reset(dev, vq);

444 if (vq->handle_kick)

445 vhost_poll_init(&vq->poll, vq->handle_kick,

446 POLLIN, dev);

447 }

448 }

449 EXPORT_SYMBOL_GPL(vhost_dev_init);

450

451 /* Caller should have device mutex */

452 long vhost_dev_check_owner(struct vhost_dev *dev)

453 {

454 /* Are you the owner? If not, I don't think you mean to do that */

455 return dev->mm == current->mm ? 0 : -EPERM;

456 }

457 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);

458

459 struct vhost_attach_cgroups_struct {

460 struct vhost_work work;

461 struct task_struct *owner;

462 int ret;

463 };

464

465 static void vhost_attach_cgroups_work(struct vhost_work *work)

466 {

467 struct vhost_attach_cgroups_struct *s;

468

469 s = container_of(work, struct vhost_attach_cgroups_struct, work);

470 s->ret = cgroup_attach_task_all(s->owner, current);

471 }

472

473 static int vhost_attach_cgroups(struct vhost_dev *dev)

474 {

475 struct vhost_attach_cgroups_struct attach;

476

477 attach.owner = current;

478 vhost_work_init(&attach.work, vhost_attach_cgroups_work);

479 vhost_work_queue(dev, &attach.work);

480 vhost_work_flush(dev, &attach.work);

481 return attach.ret;

482 }

483

484 /* Caller should have device mutex */

485 bool vhost_dev_has_owner(struct vhost_dev *dev)

486 {

487 return dev->mm;

488 }

489 EXPORT_SYMBOL_GPL(vhost_dev_has_owner);

490

491 /* Caller should have device mutex */

492 long vhost_dev_set_owner(struct vhost_dev *dev)

493 {

494 struct task_struct *worker;

495 int err;

496

497 /* Is there an owner already? */

498 if (vhost_dev_has_owner(dev)) {

499 err = -EBUSY;

500 goto err_mm;

501 }

502

503 /* No owner, become one */

504 dev->mm = get_task_mm(current);

505 worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);

506 if (IS_ERR(worker)) {

507 err = PTR_ERR(worker);

508 goto err_worker;

509 }

510

511 dev->worker = worker;

512 wake_up_process(worker); /* avoid contributing to loadavg */

513

514 err = vhost_attach_cgroups(dev);

515 if (err)

516 goto err_cgroup;

517

518 err = vhost_dev_alloc_iovecs(dev);

519 if (err)

520 goto err_cgroup;

521

522 return 0;

523 err_cgroup:

524 kthread_stop(worker);

525 dev->worker = NULL;

526 err_worker:

527 if (dev->mm)

528 mmput(dev->mm);

529 dev->mm = NULL;

530 err_mm:

531 return err;

532 }

533 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);

534

535 struct vhost_umem *vhost_dev_reset_owner_prepare(void)

536 {

537 return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);

538 }

539 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);

540

541 /* Caller should have device mutex */

542 void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)

543 {

544 int i;

545

546 vhost_dev_cleanup(dev, true);

547

548 /* Restore memory to default empty mapping. */

549 INIT_LIST_HEAD(&umem->umem_list);

550 dev->umem = umem;

551 /* We don't need VQ locks below since vhost_dev_cleanup makes sure

552 * VQs aren't running.

553 */

554 for (i = 0; i < dev->nvqs; ++i)

555 dev->vqs[i]->umem = umem;

556 }

557 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);

558

559 void vhost_dev_stop(struct vhost_dev *dev)

560 {

561 int i;

562

563 for (i = 0; i < dev->nvqs; ++i) {

564 if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {

565 vhost_poll_stop(&dev->vqs[i]->poll);

566 vhost_poll_flush(&dev->vqs[i]->poll);

567 }

568 }

569 }

570 EXPORT_SYMBOL_GPL(vhost_dev_stop);

571

572 static void vhost_umem_free(struct vhost_umem *umem,

573 struct vhost_umem_node *node)

574 {

575 vhost_umem_interval_tree_remove(node, &umem->umem_tree);

576 list_del(&node->link);

577 kfree(node);

578 umem->numem--;

579 }

580

581 static void vhost_umem_clean(struct vhost_umem *umem)

582 {

583 struct vhost_umem_node *node, *tmp;

584

585 if (!umem)

586 return;

587

588 list_for_each_entry_safe(node, tmp, &umem->umem_list, link)

589 vhost_umem_free(umem, node);

590

591 kvfree(umem);

592 }

593

594 static void vhost_clear_msg(struct vhost_dev *dev)

595 {

596 struct vhost_msg_node *node, *n;

597

598 spin_lock(&dev->iotlb_lock);

599

600 list_for_each_entry_safe(node, n, &dev->read_list, node) {

601 list_del(&node->node);

602 kfree(node);

603 }

604

605 list_for_each_entry_safe(node, n, &dev->pending_list, node) {

606 list_del(&node->node);

607 kfree(node);

608 }

609

610 spin_unlock(&dev->iotlb_lock);

611 }

612

613 /* Caller should have device mutex if and only if locked is set */

614 void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)

615 {

616 int i;

617

618 for (i = 0; i < dev->nvqs; ++i) {

619 if (dev->vqs[i]->error_ctx)

620 eventfd_ctx_put(dev->vqs[i]->error_ctx);

621 if (dev->vqs[i]->error)

622 fput(dev->vqs[i]->error);

623 if (dev->vqs[i]->kick)

624 fput(dev->vqs[i]->kick);

625 if (dev->vqs[i]->call_ctx)

626 eventfd_ctx_put(dev->vqs[i]->call_ctx);

627 if (dev->vqs[i]->call)

628 fput(dev->vqs[i]->call);

629 vhost_vq_reset(dev, dev->vqs[i]);

630 }

631 vhost_dev_free_iovecs(dev);

632 if (dev->log_ctx)

633 eventfd_ctx_put(dev->log_ctx);

634 dev->log_ctx = NULL;

635 if (dev->log_file)

636 fput(dev->log_file);

637 dev->log_file = NULL;

638 /* No one will access memory at this point */

639 vhost_umem_clean(dev->umem);

640 dev->umem = NULL;

641 vhost_umem_clean(dev->iotlb);

642 dev->iotlb = NULL;

643 vhost_clear_msg(dev);

644 wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);

645 WARN_ON(!llist_empty(&dev->work_list));

646 if (dev->worker) {

647 kthread_stop(dev->worker);

648 dev->worker = NULL;

649 }

650 if (dev->mm)

651 mmput(dev->mm);

652 dev->mm = NULL;

653 }

654 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

655

656 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)

657 {

658 u64 a = addr / VHOST_PAGE_SIZE / 8;

659

660 /* Make sure 64 bit math will not overflow. */

661 if (a > ULONG_MAX - (unsigned long)log_base ||

662 a + (unsigned long)log_base > ULONG_MAX)

663 return 0;

664

665 return access_ok(VERIFY_WRITE, log_base + a,

666 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);

667 }

668

/*
 * Return true when the user range [@uaddr, @uaddr + @size - 1] does not fit
 * in an unsigned long.
 *
 * The bound is checked against the last byte of the range, so a range that
 * ends exactly at ULONG_MAX is accepted. An empty range never overflows as
 * long as @uaddr itself is representable.
 */
static bool vhost_overflow(u64 uaddr, u64 size)
{
	/* Make sure 64 bit math will not overflow. */
	if (uaddr > ULONG_MAX || size > ULONG_MAX)
		return true;

	if (!size)
		return false;

	/* size >= 1 here, so ULONG_MAX - size + 1 cannot wrap. */
	return uaddr > ULONG_MAX - size + 1;
}

674

675 /* Caller should have vq mutex and device mutex. */

676 static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,

677 int log_all)

678 {

679 struct vhost_umem_node *node;

680

681 if (!umem)

682 return 0;

683

684 list_for_each_entry(node, &umem->umem_list, link) {

685 unsigned long a = node->userspace_addr;

686

687 if (vhost_overflow(node->userspace_addr, node->size))

688 return 0;

689

690

691 if (!access_ok(VERIFY_WRITE, (void __user *)a,

692 node->size))

693 return 0;

694 else if (log_all && !log_access_ok(log_base,

695 node->start,

696 node->size))

697 return 0;

698 }

699 return 1;

700 }

701

702 static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,

703 u64 addr, unsigned int size,

704 int type)

705 {

706 const struct vhost_umem_node *node = vq->meta_iotlb[type];

707

708 if (!node)

709 return NULL;

710

711 return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);

712 }

713

714 /* Can we switch to this memory table? */

715 /* Caller should have device mutex but not vq mutex */

716 static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,

717 int log_all)

718 {

719 int i;

720

721 for (i = 0; i < d->nvqs; ++i) {

722 int ok;

723 bool log;

724

725 mutex_lock(&d->vqs[i]->mutex);

726 log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);

727 /* If ring is inactive, will check when it's enabled. */

728 if (d->vqs[i]->private_data)

729 ok = vq_memory_access_ok(d->vqs[i]->log_base,

730 umem, log);

731 else

732 ok = 1;

733 mutex_unlock(&d->vqs[i]->mutex);

734 if (!ok)

735 return 0;

736 }

737 return 1;

738 }

739

740 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,

741 struct iovec iov[], int iov_size, int access);

742

743 static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,

744 const void *from, unsigned size)

745 {

746 int ret;

747

748 if (!vq->iotlb)

749 return __copy_to_user(to, from, size);

750 else {

751 /* This function should be called after iotlb

752 * prefetch, which means we're sure that all vq

753 * could be access through iotlb. So -EAGAIN should

754 * not happen in this case.

755 */

756 struct iov_iter t;

757 void __user *uaddr = vhost_vq_meta_fetch(vq,

758 (u64)(uintptr_t)to, size,

759 VHOST_ADDR_DESC);

760

761 if (uaddr)

762 return __copy_to_user(uaddr, from, size);

763

764 ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,

765 ARRAY_SIZE(vq->iotlb_iov),

766 VHOST_ACCESS_WO);

767 if (ret < 0)

768 goto out;

769 iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);

770 ret = copy_to_iter(from, size, &t);

771 if (ret == size)

772 ret = 0;

773 }

774 out:

775 return ret;

776 }

777

778 static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,

779 void __user *from, unsigned size)

780 {

781 int ret;

782

783 if (!vq->iotlb)

784 return __copy_from_user(to, from, size);

785 else {

786 /* This function should be called after iotlb

787 * prefetch, which means we're sure that vq

788 * could be access through iotlb. So -EAGAIN should

789 * not happen in this case.

790 */

791 void __user *uaddr = vhost_vq_meta_fetch(vq,

792 (u64)(uintptr_t)from, size,

793 VHOST_ADDR_DESC);

794 struct iov_iter f;

795

796 if (uaddr)

797 return __copy_from_user(to, uaddr, size);

798

799 ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,

800 ARRAY_SIZE(vq->iotlb_iov),

801 VHOST_ACCESS_RO);

802 if (ret < 0) {

803 vq_err(vq, "IOTLB translation failure: uaddr "

804 "%p size 0x%llx\n", from,

805 (unsigned long long) size);

806 goto out;

807 }

808 iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);

809 ret = copy_from_iter(to, size, &f);

810 if (ret == size)

811 ret = 0;

812 }

813

814 out:

815 return ret;

816 }

817

818 static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,

819 void __user *addr, unsigned int size,

820 int type)

821 {

822 int ret;

823

824 ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,

825 ARRAY_SIZE(vq->iotlb_iov),

826 VHOST_ACCESS_RO);

827 if (ret < 0) {

828 vq_err(vq, "IOTLB translation failure: uaddr "

829 "%p size 0x%llx\n", addr,

830 (unsigned long long) size);

831 return NULL;

832 }

833

834 if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {

835 vq_err(vq, "Non atomic userspace memory access: uaddr "

836 "%p size 0x%llx\n", addr,

837 (unsigned long long) size);

838 return NULL;

839 }

840

841 return vq->iotlb_iov[0].iov_base;

842 }

843

844 /* This function should be called after iotlb

845 * prefetch, which means we're sure that vq

846 * could be access through iotlb. So -EAGAIN should

847 * not happen in this case.

848 */

849 static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,

850 void *addr, unsigned int size,

851 int type)

852 {

853 void __user *uaddr = vhost_vq_meta_fetch(vq,

854 (u64)(uintptr_t)addr, size, type);

855 if (uaddr)

856 return uaddr;

857

858 return __vhost_get_user_slow(vq, addr, size, type);

859 }

860

/*
 * Store @x at ring address @ptr. Without a device IOTLB @ptr is used as a
 * user pointer directly; with one it is translated through the
 * VHOST_ADDR_USED slot first. Evaluates to 0 on success or -EFAULT.
 */
#define vhost_put_user(vq, x, ptr) \
({ \
	int ret = -EFAULT; \
	if (vq->iotlb) { \
		__typeof__(ptr) to = \
			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
					  sizeof(*ptr), VHOST_ADDR_USED); \
		if (to != NULL) \
			ret = __put_user(x, to); \
	} else { \
		ret = __put_user(x, ptr); \
	} \
	ret; \
})

877

/*
 * Load @x from ring address @ptr. Without a device IOTLB @ptr is used as a
 * user pointer directly; with one it is translated through metadata slot
 * @type first. Evaluates to 0 on success or -EFAULT.
 */
#define vhost_get_user(vq, x, ptr, type) \
({ \
	int ret = -EFAULT; \
	if (vq->iotlb) { \
		__typeof__(ptr) from = \
			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
							   sizeof(*ptr), \
							   type); \
		if (from != NULL) \
			ret = __get_user(x, from); \
	} else { \
		ret = __get_user(x, ptr); \
	} \
	ret; \
})

895

/* vhost_get_user() bound to the avail-ring and used-ring metadata slots. */
#define vhost_get_avail(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)

#define vhost_get_used(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)

901

902 static void vhost_dev_lock_vqs(struct vhost_dev *d)

903 {

904 int i = 0;

905 for (i = 0; i < d->nvqs; ++i)

906 mutex_lock_nested(&d->vqs[i]->mutex, i);

907 }

908

909 static void vhost_dev_unlock_vqs(struct vhost_dev *d)

910 {

911 int i = 0;

912 for (i = 0; i < d->nvqs; ++i)

913 mutex_unlock(&d->vqs[i]->mutex);

914 }

915

916 static int vhost_new_umem_range(struct vhost_umem *umem,

917 u64 start, u64 size, u64 end,

918 u64 userspace_addr, int perm)

919 {

920 struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);

921

922 if (!node)

923 return -ENOMEM;

924

925 if (umem->numem == max_iotlb_entries) {

926 tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);

927 vhost_umem_free(umem, tmp);

928 }

929

930 node->start = start;

931 node->size = size;

932 node->last = end;

933 node->userspace_addr = userspace_addr;

934 node->perm = perm;

935 INIT_LIST_HEAD(&node->link);

936 list_add_tail(&node->link, &umem->umem_list);

937 vhost_umem_interval_tree_insert(node, &umem->umem_tree);

938 umem->numem++;

939

940 return 0;

941 }

942

943 static void vhost_del_umem_range(struct vhost_umem *umem,

944 u64 start, u64 end)

945 {

946 struct vhost_umem_node *node;

947

948 while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,

949 start, end)))

950 vhost_umem_free(umem, node);

951 }

952

953 static void vhost_iotlb_notify_vq(struct vhost_dev *d,

954 struct vhost_iotlb_msg *msg)

955 {

956 struct vhost_msg_node *node, *n;

957

958 spin_lock(&d->iotlb_lock);

959

960 list_for_each_entry_safe(node, n, &d->pending_list, node) {

961 struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;

962 if (msg->iova <= vq_msg->iova &&

963 msg->iova + msg->size - 1 > vq_msg->iova &&

964 vq_msg->type == VHOST_IOTLB_MISS) {

965 vhost_poll_queue(&node->vq->poll);

966 list_del(&node->node);

967 kfree(node);

968 }

969 }

970

971 spin_unlock(&d->iotlb_lock);

972 }

973

974 static int umem_access_ok(u64 uaddr, u64 size, int access)

975 {

976 unsigned long a = uaddr;

977

978 /* Make sure 64 bit math will not overflow. */

979 if (vhost_overflow(uaddr, size))

980 return -EFAULT;

981

982 if ((access & VHOST_ACCESS_RO) &&

983 !access_ok(VERIFY_READ, (void __user *)a, size))

984 return -EFAULT;

985 if ((access & VHOST_ACCESS_WO) &&

986 !access_ok(VERIFY_WRITE, (void __user *)a, size))

987 return -EFAULT;

988 return 0;

989 }

990

991 static int vhost_process_iotlb_msg(struct vhost_dev *dev,

992 struct vhost_iotlb_msg *msg)

993 {

994 int ret = 0;

995

996 vhost_dev_lock_vqs(dev);

997 switch (msg->type) {

998 case VHOST_IOTLB_UPDATE:

999 if (!dev->iotlb) {

1000 ret = -EFAULT;

1001 break;

1002 }

1003 if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) {

1004 ret = -EFAULT;

1005 break;

1006 }

1007 vhost_vq_meta_reset(dev);

1008 if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,

1009 msg->iova + msg->size - 1,

1010 msg->uaddr, msg->perm)) {

1011 ret = -ENOMEM;

1012 break;

1013 }

1014 vhost_iotlb_notify_vq(dev, msg);

1015 break;

1016 case VHOST_IOTLB_INVALIDATE:

1017 vhost_vq_meta_reset(dev);

1018 vhost_del_umem_range(dev->iotlb, msg->iova,

1019 msg->iova + msg->size - 1);

1020 break;

1021 default:

1022 ret = -EINVAL;

1023 break;

1024 }

1025

1026 vhost_dev_unlock_vqs(dev);

1027 return ret;

1028 }

1029 ssize_t vhost_chr_write_iter(struct vhost_dev *dev,

1030 struct iov_iter *from)

1031 {

1032 struct vhost_msg_node node;

1033 unsigned size = sizeof(struct vhost_msg);

1034 size_t ret;

1035 int err;

1036

1037 if (iov_iter_count(from) < size)

1038 return 0;

1039 ret = copy_from_iter(&node.msg, size, from);

1040 if (ret != size)

1041 goto done;

1042

1043 switch (node.msg.type) {

1044 case VHOST_IOTLB_MSG:

1045 err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);

1046 if (err)

1047 ret = err;

1048 break;

1049 default:

1050 ret = -EINVAL;

1051 break;

1052 }

1053

1054 done:

1055 return ret;

1056 }

1057 EXPORT_SYMBOL(vhost_chr_write_iter);

1058

1059 unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,

1060 poll_table *wait)

1061 {

1062 unsigned int mask = 0;

1063

1064 poll_wait(file, &dev->wait, wait);

1065

1066 if (!list_empty(&dev->read_list))

1067 mask |= POLLIN | POLLRDNORM;

1068

1069 return mask;

1070 }

1071 EXPORT_SYMBOL(vhost_chr_poll);

1072

1073 ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,

1074 int noblock)

1075 {

1076 DEFINE_WAIT(wait);

1077 struct vhost_msg_node *node;

1078 ssize_t ret = 0;

1079 unsigned size = sizeof(struct vhost_msg);

1080

1081 if (iov_iter_count(to) < size)

1082 return 0;

1083

1084 while (1) {

1085 if (!noblock)

1086 prepare_to_wait(&dev->wait, &wait,

1087 TASK_INTERRUPTIBLE);

1088

1089 node = vhost_dequeue_msg(dev, &dev->read_list);

1090 if (node)

1091 break;

1092 if (noblock) {

1093 ret = -EAGAIN;

1094 break;

1095 }

1096 if (signal_pending(current)) {

1097 ret = -ERESTARTSYS;

1098 break;

1099 }

1100 if (!dev->iotlb) {

1101 ret = -EBADFD;

1102 break;

1103 }

1104

1105 schedule();

1106 }

1107

1108 if (!noblock)

1109 finish_wait(&dev->wait, &wait);

1110

1111 if (node) {

1112 ret = copy_to_iter(&node->msg, size, to);

1113

1114 if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {

1115 kfree(node);

1116 return ret;

1117 }

1118

1119 vhost_enqueue_msg(dev, &dev->pending_list, node);

1120 }

1121

1122 return ret;

1123 }

1124 EXPORT_SYMBOL_GPL(vhost_chr_read_iter);

1125

1126 static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)

1127 {

1128 struct vhost_dev *dev = vq->dev;

1129 struct vhost_msg_node *node;

1130 struct vhost_iotlb_msg *msg;

1131

1132 node = vhost_new_msg(vq, VHOST_IOTLB_MISS);

1133 if (!node)

1134 return -ENOMEM;

1135

1136 msg = &node->msg.iotlb;

1137 msg->type = VHOST_IOTLB_MISS;

1138 msg->iova = iova;

1139 msg->perm = access;

1140

1141 vhost_enqueue_msg(dev, &dev->read_list, node);

1142

1143 return 0;

1144 }

1145

1146 static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,

1147 struct vring_desc __user *desc,

1148 struct vring_avail __user *avail,

1149 struct vring_used __user *used)

1150

1151 {

1152 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

1153

1154 return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&

1155 access_ok(VERIFY_READ, avail,

1156 sizeof *avail + num * sizeof *avail->ring + s) &&

1157 access_ok(VERIFY_WRITE, used,

1158 sizeof *used + num * sizeof *used->ring + s);

1159 }

1160

1161 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,

1162 const struct vhost_umem_node *node,

1163 int type)

1164 {

1165 int access = (type == VHOST_ADDR_USED) ?

1166 VHOST_ACCESS_WO : VHOST_ACCESS_RO;

1167

1168 if (likely(node->perm & access))

1169 vq->meta_iotlb[type] = node;

1170 }

1171

1172 static int iotlb_access_ok(struct vhost_virtqueue *vq,

1173 int access, u64 addr, u64 len, int type)

1174 {

1175 const struct vhost_umem_node *node;

1176 struct vhost_umem *umem = vq->iotlb;

1177 u64 s = 0, size, orig_addr = addr;

1178

1179 if (vhost_vq_meta_fetch(vq, addr, len, type))

1180 return true;

1181

1182 while (len > s) {

1183 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,

1184 addr,

1185 addr + len - 1);

1186 if (node == NULL || node->start > addr) {

1187 vhost_iotlb_miss(vq, addr, access);

1188 return false;

1189 } else if (!(node->perm & access)) {

1190 /* Report the possible access violation by

1191 * request another translation from userspace.

1192 */

1193 return false;

1194 }

1195

1196 size = node->size - addr + node->start;

1197

1198 if (orig_addr == addr && size >= len)

1199 vhost_vq_meta_update(vq, node, type);

1200

1201 s += size;

1202 addr += size;

1203 }

1204

1205 return true;

1206 }

1207

1208 int vq_iotlb_prefetch(struct vhost_virtqueue *vq)

1209 {

1210 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

1211 unsigned int num = vq->num;

1212

1213 if (!vq->iotlb)

1214 return 1;

1215

1216 return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,

1217 num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&

1218 iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,

1219 sizeof *vq->avail +

1220 num * sizeof(*vq->avail->ring) + s,

1221 VHOST_ADDR_AVAIL) &&

1222 iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,

1223 sizeof *vq->used +

1224 num * sizeof(*vq->used->ring) + s,

1225 VHOST_ADDR_USED);

1226 }

1227 EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);

1228

1229 /* Can we log writes? */

1230 /* Caller should have device mutex but not vq mutex */

1231 int vhost_log_access_ok(struct vhost_dev *dev)

1232 {

1233 return memory_access_ok(dev, dev->umem, 1);

1234 }

1235 EXPORT_SYMBOL_GPL(vhost_log_access_ok);

1236

1237 /* Verify access for write logging. */

1238 /* Caller should have vq mutex and device mutex */

1239 static int vq_log_access_ok(struct vhost_virtqueue *vq,

1240 void __user *log_base)

1241 {

1242 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

1243

1244 return vq_memory_access_ok(log_base, vq->umem,

1245 vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&

1246 (!vq->log_used || log_access_ok(log_base, vq->log_addr,

1247 sizeof *vq->used +

1248 vq->num * sizeof *vq->used->ring + s));

1249 }

1250

1251 /* Can we start vq? */

1252 /* Caller should have vq mutex and device mutex */

1253 int vhost_vq_access_ok(struct vhost_virtqueue *vq)

1254 {

1255 if (vq->iotlb) {

1256 /* When device IOTLB was used, the access validation

1257 * will be validated during prefetching.

1258 */

1259 return 1;

1260 }

1261 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) &&

1262 vq_log_access_ok(vq, vq->log_base);

1263 }

1264 EXPORT_SYMBOL_GPL(vhost_vq_access_ok);

1265

1266 static struct vhost_umem *vhost_umem_alloc(void)

1267 {

1268 struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);

1269

1270 if (!umem)

1271 return NULL;

1272

1273 umem->umem_tree = RB_ROOT_CACHED;

1274 umem->numem = 0;

1275 INIT_LIST_HEAD(&umem->umem_list);

1276

1277 return umem;

1278 }

1279

1280 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)

1281 {

1282 struct vhost_memory mem, *newmem;

1283 struct vhost_memory_region *region;

1284 struct vhost_umem *newumem, *oldumem;

1285 unsigned long size = offsetof(struct vhost_memory, regions);

1286 int i;

1287

1288 if (copy_from_user(&mem, m, size))

1289 return -EFAULT;

1290 if (mem.padding)

1291 return -EOPNOTSUPP;

1292 if (mem.nregions > max_mem_regions)

1293 return -E2BIG;

1294 newmem = kvzalloc(size + mem.nregions * sizeof(*m->regions), GFP_KERNEL);

1295 if (!newmem)

1296 return -ENOMEM;

1297

1298 memcpy(newmem, &mem, size);

1299 if (copy_from_user(newmem->regions, m->regions,

1300 mem.nregions * sizeof *m->regions)) {

1301 kvfree(newmem);

1302 return -EFAULT;

1303 }

1304

1305 newumem = vhost_umem_alloc();

1306 if (!newumem) {

1307 kvfree(newmem);

1308 return -ENOMEM;

1309 }

1310

1311 for (region = newmem->regions;

1312 region < newmem->regions + mem.nregions;

1313 region++) {

1314 if (vhost_new_umem_range(newumem,

1315 region->guest_phys_addr,

1316 region->memory_size,

1317 region->guest_phys_addr +

1318 region->memory_size - 1,

1319 region->userspace_addr,

1320 VHOST_ACCESS_RW))

1321 goto err;

1322 }

1323

1324 if (!memory_access_ok(d, newumem, 0))

1325 goto err;

1326

1327 oldumem = d->umem;

1328 d->umem = newumem;

1329

1330 /* All memory accesses are done under some VQ mutex. */

1331 for (i = 0; i < d->nvqs; ++i) {

1332 mutex_lock(&d->vqs[i]->mutex);

1333 d->vqs[i]->umem = newumem;

1334 mutex_unlock(&d->vqs[i]->mutex);

1335 }

1336

1337 kvfree(newmem);

1338 vhost_umem_clean(oldumem);

1339 return 0;

1340

1341 err:

1342 vhost_umem_clean(newumem);

1343 kvfree(newmem);

1344 return -EFAULT;

1345 }

1346

1347 long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)

1348 {

1349 struct file *eventfp, *filep = NULL;

1350 bool pollstart = false, pollstop = false;

1351 struct eventfd_ctx *ctx = NULL;

1352 u32 __user *idxp = argp;

1353 struct vhost_virtqueue *vq;

1354 struct vhost_vring_state s;

1355 struct vhost_vring_file f;

1356 struct vhost_vring_addr a;

1357 u32 idx;

1358 long r;

1359

1360 r = get_user(idx, idxp);

1361 if (r < 0)

1362 return r;

1363 if (idx >= d->nvqs)

1364 return -ENOBUFS;

1365

1366 vq = d->vqs[idx];

1367

1368 mutex_lock(&vq->mutex);

1369

1370 switch (ioctl) {

1371 case VHOST_SET_VRING_NUM:

1372 /* Resizing ring with an active backend?

1373 * You don't want to do that. */

1374 if (vq->private_data) {

1375 r = -EBUSY;

1376 break;

1377 }

1378 if (copy_from_user(&s, argp, sizeof s)) {

1379 r = -EFAULT;

1380 break;

1381 }

1382 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {

1383 r = -EINVAL;

1384 break;

1385 }

1386 vq->num = s.num;

1387 break;

1388 case VHOST_SET_VRING_BASE:

1389 /* Moving base with an active backend?

1390 * You don't want to do that. */

1391 if (vq->private_data) {

1392 r = -EBUSY;

1393 break;

1394 }

1395 if (copy_from_user(&s, argp, sizeof s)) {

1396 r = -EFAULT;

1397 break;

1398 }

1399 if (s.num > 0xffff) {

1400 r = -EINVAL;

1401 break;

1402 }

1403 vq->last_avail_idx = s.num;

1404 /* Forget the cached index value. */

1405 vq->avail_idx = vq->last_avail_idx;

1406 break;

1407 case VHOST_GET_VRING_BASE:

1408 s.index = idx;

1409 s.num = vq->last_avail_idx;

1410 if (copy_to_user(argp, &s, sizeof s))

1411 r = -EFAULT;

1412 break;

1413 case VHOST_SET_VRING_ADDR:

1414 if (copy_from_user(&a, argp, sizeof a)) {

1415 r = -EFAULT;

1416 break;

1417 }

1418 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {

1419 r = -EOPNOTSUPP;

1420 break;

1421 }

1422 /* For 32bit, verify that the top 32bits of the user

1423 data are set to zero. */

1424 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||

1425 (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||

1426 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {

1427 r = -EFAULT;

1428 break;

1429 }

1430

1431 /* Make sure it's safe to cast pointers to vring types. */

1432 BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);

1433 BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);

1434 if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||

1435 (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||

1436 (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {

1437 r = -EINVAL;

1438 break;

1439 }

1440

1441 /* We only verify access here if backend is configured.

1442 * If it is not, we don't as size might not have been setup.

1443 * We will verify when backend is configured. */

1444 if (vq->private_data) {

1445 if (!vq_access_ok(vq, vq->num,

1446 (void __user *)(unsigned long)a.desc_user_addr,

1447 (void __user *)(unsigned long)a.avail_user_addr,

1448 (void __user *)(unsigned long)a.used_user_addr)) {

1449 r = -EINVAL;

1450 break;

1451 }

1452

1453 /* Also validate log access for used ring if enabled. */

1454 if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&

1455 !log_access_ok(vq->log_base, a.log_guest_addr,

1456 sizeof *vq->used +

1457 vq->num * sizeof *vq->used->ring)) {

1458 r = -EINVAL;

1459 break;

1460 }

1461 }

1462

1463 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));

1464 vq->desc = (void __user *)(unsigned long)a.desc_user_addr;

1465 vq->avail = (void __user *)(unsigned long)a.avail_user_addr;

1466 vq->log_addr = a.log_guest_addr;

1467 vq->used = (void __user *)(unsigned long)a.used_user_addr;

1468 break;

1469 case VHOST_SET_VRING_KICK:

1470 if (copy_from_user(&f, argp, sizeof f)) {

1471 r = -EFAULT;

1472 break;

1473 }

1474 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);

1475 if (IS_ERR(eventfp)) {

1476 r = PTR_ERR(eventfp);

1477 break;

1478 }

1479 if (eventfp != vq->kick) {

1480 pollstop = (filep = vq->kick) != NULL;

1481 pollstart = (vq->kick = eventfp) != NULL;

1482 } else

1483 filep = eventfp;

1484 break;

1485 case VHOST_SET_VRING_CALL:

1486 if (copy_from_user(&f, argp, sizeof f)) {

1487 r = -EFAULT;

1488 break;

1489 }

1490 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);

1491 if (IS_ERR(eventfp)) {

1492 r = PTR_ERR(eventfp);

1493 break;

1494 }

1495 if (eventfp != vq->call) {

1496 filep = vq->call;

1497 ctx = vq->call_ctx;

1498 vq->call = eventfp;

1499 vq->call_ctx = eventfp ?

1500 eventfd_ctx_fileget(eventfp) : NULL;

1501 } else

1502 filep = eventfp;

1503 break;

1504 case VHOST_SET_VRING_ERR:

1505 if (copy_from_user(&f, argp, sizeof f)) {

1506 r = -EFAULT;

1507 break;

1508 }

1509 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);

1510 if (IS_ERR(eventfp)) {

1511 r = PTR_ERR(eventfp);

1512 break;

1513 }

1514 if (eventfp != vq->error) {

1515 filep = vq->error;

1516 vq->error = eventfp;

1517 ctx = vq->error_ctx;

1518 vq->error_ctx = eventfp ?

1519 eventfd_ctx_fileget(eventfp) : NULL;

1520 } else

1521 filep = eventfp;

1522 break;

1523 case VHOST_SET_VRING_ENDIAN:

1524 r = vhost_set_vring_endian(vq, argp);

1525 break;

1526 case VHOST_GET_VRING_ENDIAN:

1527 r = vhost_get_vring_endian(vq, idx, argp);

1528 break;

1529 case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:

1530 if (copy_from_user(&s, argp, sizeof(s))) {

1531 r = -EFAULT;

1532 break;

1533 }

1534 vq->busyloop_timeout = s.num;

1535 break;

1536 case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:

1537 s.index = idx;

1538 s.num = vq->busyloop_timeout;

1539 if (copy_to_user(argp, &s, sizeof(s)))

1540 r = -EFAULT;

1541 break;

1542 default:

1543 r = -ENOIOCTLCMD;

1544 }

1545

1546 if (pollstop && vq->handle_kick)

1547 vhost_poll_stop(&vq->poll);

1548

1549 if (ctx)

1550 eventfd_ctx_put(ctx);

1551 if (filep)

1552 fput(filep);

1553

1554 if (pollstart && vq->handle_kick)

1555 r = vhost_poll_start(&vq->poll, vq->kick);

1556

1557 mutex_unlock(&vq->mutex);

1558

1559 if (pollstop && vq->handle_kick)

1560 vhost_poll_flush(&vq->poll);

1561 return r;

1562 }

1563 EXPORT_SYMBOL_GPL(vhost_vring_ioctl);

1564

1565 int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)

1566 {

1567 struct vhost_umem *niotlb, *oiotlb;

1568 int i;

1569

1570 niotlb = vhost_umem_alloc();

1571 if (!niotlb)

1572 return -ENOMEM;

1573

1574 oiotlb = d->iotlb;

1575 d->iotlb = niotlb;

1576

1577 for (i = 0; i < d->nvqs; ++i) {

1578 mutex_lock(&d->vqs[i]->mutex);

1579 d->vqs[i]->iotlb = niotlb;

1580 mutex_unlock(&d->vqs[i]->mutex);

1581 }

1582

1583 vhost_umem_clean(oiotlb);

1584

1585 return 0;

1586 }

1587 EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);

1588

1589 /* Caller must have device mutex */

1590 long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)

1591 {

1592 struct file *eventfp, *filep = NULL;

1593 struct eventfd_ctx *ctx = NULL;

1594 u64 p;

1595 long r;

1596 int i, fd;

1597

1598 /* If you are not the owner, you can become one */

1599 if (ioctl == VHOST_SET_OWNER) {

1600 r = vhost_dev_set_owner(d);

1601 goto done;

1602 }

1603

1604 /* You must be the owner to do anything else */

1605 r = vhost_dev_check_owner(d);

1606 if (r)

1607 goto done;

1608

1609 switch (ioctl) {

1610 case VHOST_SET_MEM_TABLE:

1611 r = vhost_set_memory(d, argp);

1612 break;

1613 case VHOST_SET_LOG_BASE:

1614 if (copy_from_user(&p, argp, sizeof p)) {

1615 r = -EFAULT;

1616 break;

1617 }

1618 if ((u64)(unsigned long)p != p) {

1619 r = -EFAULT;

1620 break;

1621 }

1622 for (i = 0; i < d->nvqs; ++i) {

1623 struct vhost_virtqueue *vq;

1624 void __user *base = (void __user *)(unsigned long)p;

1625 vq = d->vqs[i];

1626 mutex_lock(&vq->mutex);

1627 /* If ring is inactive, will check when it's enabled. */

1628 if (vq->private_data && !vq_log_access_ok(vq, base))

1629 r = -EFAULT;

1630 else

1631 vq->log_base = base;

1632 mutex_unlock(&vq->mutex);

1633 }

1634 break;

1635 case VHOST_SET_LOG_FD:

1636 r = get_user(fd, (int __user *)argp);

1637 if (r < 0)

1638 break;

1639 eventfp = fd == -1 ? NULL : eventfd_fget(fd);

1640 if (IS_ERR(eventfp)) {

1641 r = PTR_ERR(eventfp);

1642 break;

1643 }

1644 if (eventfp != d->log_file) {

1645 filep = d->log_file;

1646 d->log_file = eventfp;

1647 ctx = d->log_ctx;

1648 d->log_ctx = eventfp ?

1649 eventfd_ctx_fileget(eventfp) : NULL;

1650 } else

1651 filep = eventfp;

1652 for (i = 0; i < d->nvqs; ++i) {

1653 mutex_lock(&d->vqs[i]->mutex);

1654 d->vqs[i]->log_ctx = d->log_ctx;

1655 mutex_unlock(&d->vqs[i]->mutex);

1656 }

1657 if (ctx)

1658 eventfd_ctx_put(ctx);

1659 if (filep)

1660 fput(filep);

1661 break;

1662 default:

1663 r = -ENOIOCTLCMD;

1664 break;

1665 }

1666 done:

1667 return r;

1668 }

1669 EXPORT_SYMBOL_GPL(vhost_dev_ioctl);

1670

1671 /* TODO: This is really inefficient. We need something like get_user()

1672 * (instruction directly accesses the data, with an exception table entry

1673 * returning -EFAULT). See Documentation/x86/exception-tables.txt.

1674 */

1675 static int set_bit_to_user(int nr, void __user *addr)

1676 {

1677 unsigned long log = (unsigned long)addr;

1678 struct page *page;

1679 void *base;

1680 int bit = nr + (log % PAGE_SIZE) * 8;

1681 int r;

1682

1683 r = get_user_pages_fast(log, 1, 1, &page);

1684 if (r < 0)

1685 return r;

1686 BUG_ON(r != 1);

1687 base = kmap_atomic(page);

1688 set_bit(bit, base);

1689 kunmap_atomic(base);

1690 set_page_dirty_lock(page);

1691 put_page(page);

1692 return 0;

1693 }

1694

1695 static int log_write(void __user *log_base,

1696 u64 write_address, u64 write_length)

1697 {

1698 u64 write_page = write_address / VHOST_PAGE_SIZE;

1699 int r;

1700

1701 if (!write_length)

1702 return 0;

1703 write_length += write_address % VHOST_PAGE_SIZE;

1704 for (;;) {

1705 u64 base = (u64)(unsigned long)log_base;

1706 u64 log = base + write_page / 8;

1707 int bit = write_page % 8;

1708 if ((u64)(unsigned long)log != log)

1709 return -EFAULT;

1710 r = set_bit_to_user(bit, (void __user *)(unsigned long)log);

1711 if (r < 0)

1712 return r;

1713 if (write_length <= VHOST_PAGE_SIZE)

1714 break;

1715 write_length -= VHOST_PAGE_SIZE;

1716 write_page += 1;

1717 }

1718 return r;

1719 }

1720

1721 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,

1722 unsigned int log_num, u64 len)

1723 {

1724 int i, r;

1725

1726 /* Make sure data written is seen before log. */

1727 smp_wmb();

1728 for (i = 0; i < log_num; ++i) {

1729 u64 l = min(log[i].len, len);

1730 r = log_write(vq->log_base, log[i].addr, l);

1731 if (r < 0)

1732 return r;

1733 len -= l;

1734 if (!len) {

1735 if (vq->log_ctx)

1736 eventfd_signal(vq->log_ctx, 1);

1737 return 0;

1738 }

1739 }

1740 /* Length written exceeds what we have stored. This is a bug. */

1741 BUG();

1742 return 0;

1743 }

1744 EXPORT_SYMBOL_GPL(vhost_log_write);

1745

1746 static int vhost_update_used_flags(struct vhost_virtqueue *vq)

1747 {

1748 void __user *used;

1749 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),

1750 &vq->used->flags) < 0)

1751 return -EFAULT;

1752 if (unlikely(vq->log_used)) {

1753 /* Make sure the flag is seen before log. */

1754 smp_wmb();

1755 /* Log used flag write. */

1756 used = &vq->used->flags;

1757 log_write(vq->log_base, vq->log_addr +

1758 (used - (void __user *)vq->used),

1759 sizeof vq->used->flags);

1760 if (vq->log_ctx)

1761 eventfd_signal(vq->log_ctx, 1);

1762 }

1763 return 0;

1764 }

1765

1766 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)

1767 {

1768 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),

1769 vhost_avail_event(vq)))

1770 return -EFAULT;

1771 if (unlikely(vq->log_used)) {

1772 void __user *used;

1773 /* Make sure the event is seen before log. */

1774 smp_wmb();

1775 /* Log avail event write */

1776 used = vhost_avail_event(vq);

1777 log_write(vq->log_base, vq->log_addr +

1778 (used - (void __user *)vq->used),

1779 sizeof *vhost_avail_event(vq));

1780 if (vq->log_ctx)

1781 eventfd_signal(vq->log_ctx, 1);

1782 }

1783 return 0;

1784 }

1785

1786 int vhost_vq_init_access(struct vhost_virtqueue *vq)

1787 {

1788 __virtio16 last_used_idx;

1789 int r;

1790 bool is_le = vq->is_le;

1791

1792 if (!vq->private_data)

1793 return 0;

1794

1795 vhost_init_is_le(vq);

1796

1797 r = vhost_update_used_flags(vq);

1798 if (r)

1799 goto err;

1800 vq->signalled_used_valid = false;

1801 if (!vq->iotlb &&

1802 !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {

1803 r = -EFAULT;

1804 goto err;

1805 }

1806 r = vhost_get_used(vq, last_used_idx, &vq->used->idx);

1807 if (r) {

1808 vq_err(vq, "Can't access used idx at %p\n",

1809 &vq->used->idx);

1810 goto err;

1811 }

1812 vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);

1813 return 0;

1814

1815 err:

1816 vq->is_le = is_le;

1817 return r;

1818 }

1819 EXPORT_SYMBOL_GPL(vhost_vq_init_access);

1820

1821 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,

1822 struct iovec iov[], int iov_size, int access)

1823 {

1824 const struct vhost_umem_node *node;

1825 struct vhost_dev *dev = vq->dev;

1826 struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;

1827 struct iovec *_iov;

1828 u64 s = 0;

1829 int ret = 0;

1830

1831 while ((u64)len > s) {

1832 u64 size;

1833 if (unlikely(ret >= iov_size)) {

1834 ret = -ENOBUFS;

1835 break;

1836 }

1837

1838 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,

1839 addr, addr + len - 1);

1840 if (node == NULL || node->start > addr) {

1841 if (umem != dev->iotlb) {

1842 ret = -EFAULT;

1843 break;

1844 }

1845 ret = -EAGAIN;

1846 break;

1847 } else if (!(node->perm & access)) {

1848 ret = -EPERM;

1849 break;

1850 }

1851

1852 _iov = iov + ret;

1853 size = node->size - addr + node->start;

1854 _iov->iov_len = min((u64)len - s, size);

1855 _iov->iov_base = (void __user *)(unsigned long)

1856 (node->userspace_addr + addr - node->start);

1857 s += size;

1858 addr += size;

1859 ++ret;

1860 }

1861

1862 if (ret == -EAGAIN)

1863 vhost_iotlb_miss(vq, addr, access);

1864 return ret;

1865 }

1866

1867 /* Each buffer in the virtqueues is actually a chain of descriptors. This

1868 * function returns the next descriptor in the chain,

1869 * or -1U if we're at the end. */

1870 static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)

1871 {

1872 unsigned int next;

1873

1874 /* If this descriptor says it doesn't chain, we're done. */

1875 if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))

1876 return -1U;

1877

1878 /* Check they're not leading us off end of descriptors. */

1879 next = vhost16_to_cpu(vq, desc->next);

1880 /* Make sure compiler knows to grab that: we don't want it changing! */

1881 /* We will use the result as an index in an array, so most

1882 * architectures only need a compiler barrier here. */

1883 read_barrier_depends();

1884

1885 return next;

1886 }

1887

1888 static int get_indirect(struct vhost_virtqueue *vq,

1889 struct iovec iov[], unsigned int iov_size,

1890 unsigned int *out_num, unsigned int *in_num,

1891 struct vhost_log *log, unsigned int *log_num,

1892 struct vring_desc *indirect)

1893 {

1894 struct vring_desc desc;

1895 unsigned int i = 0, count, found = 0;

1896 u32 len = vhost32_to_cpu(vq, indirect->len);

1897 struct iov_iter from;

1898 int ret, access;

1899

1900 /* Sanity check */

1901 if (unlikely(len % sizeof desc)) {

1902 vq_err(vq, "Invalid length in indirect descriptor: "

1903 "len 0x%llx not multiple of 0x%zx\n",

1904 (unsigned long long)len,

1905 sizeof desc);

1906 return -EINVAL;

1907 }

1908

1909 ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,

1910 UIO_MAXIOV, VHOST_ACCESS_RO);

1911 if (unlikely(ret < 0)) {

1912 if (ret != -EAGAIN)

1913 vq_err(vq, "Translation failure %d in indirect.\n", ret);

1914 return ret;

1915 }

1916 iov_iter_init(&from, READ, vq->indirect, ret, len);

1917

1918 /* We will use the result as an address to read from, so most

1919 * architectures only need a compiler barrier here. */

1920 read_barrier_depends();

1921

1922 count = len / sizeof desc;

1923 /* Buffers are chained via a 16 bit next field, so

1924 * we can have at most 2^16 of these. */

1925 if (unlikely(count > USHRT_MAX + 1)) {

1926 vq_err(vq, "Indirect buffer length too big: %d\n",

1927 indirect->len);

1928 return -E2BIG;

1929 }

1930

1931 do {

1932 unsigned iov_count = *in_num + *out_num;

1933 if (unlikely(++found > count)) {

1934 vq_err(vq, "Loop detected: last one at %u "

1935 "indirect size %u\n",

1936 i, count);

1937 return -EINVAL;

1938 }

1939 if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {

1940 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",

1941 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);

1942 return -EINVAL;

1943 }

1944 if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {

1945 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",

1946 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);

1947 return -EINVAL;

1948 }

1949

1950 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))

1951 access = VHOST_ACCESS_WO;

1952 else

1953 access = VHOST_ACCESS_RO;

1954

1955 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),

1956 vhost32_to_cpu(vq, desc.len), iov + iov_count,

1957 iov_size - iov_count, access);

1958 if (unlikely(ret < 0)) {

1959 if (ret != -EAGAIN)

1960 vq_err(vq, "Translation failure %d indirect idx %d\n",

1961 ret, i);

1962 return ret;

1963 }

1964 /* If this is an input descriptor, increment that count. */

1965 if (access == VHOST_ACCESS_WO) {

1966 *in_num += ret;

1967 if (unlikely(log)) {

1968 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);

1969 log[*log_num].len = vhost32_to_cpu(vq, desc.len);

1970 ++*log_num;

1971 }

1972 } else {

1973 /* If it's an output descriptor, they're all supposed

1974 * to come before any input descriptors. */

1975 if (unlikely(*in_num)) {

1976 vq_err(vq, "Indirect descriptor "

1977 "has out after in: idx %d\n", i);

1978 return -EINVAL;

1979 }

1980 *out_num += ret;

1981 }

1982 } while ((i = next_desc(vq, &desc)) != -1);

1983 return 0;

1984 }

1985

1986 /* This looks in the virtqueue and for the first available buffer, and converts

1987 * it to an iovec for convenient access. Since descriptors consist of some

1988 * number of output then some number of input descriptors, it's actually two

1989 * iovecs, but we pack them into one and note how many of each there were.

1990 *

1991 * This function returns the descriptor number found, or vq->num (which is

1992 * never a valid descriptor number) if none was found. A negative code is

1993 * returned on error. */

1994 int vhost_get_vq_desc(struct vhost_virtqueue *vq,

1995 struct iovec iov[], unsigned int iov_size,

1996 unsigned int *out_num, unsigned int *in_num,

1997 struct vhost_log *log, unsigned int *log_num)

1998 {

1999 struct vring_desc desc;

2000 unsigned int i, head, found = 0;

2001 u16 last_avail_idx;

2002 __virtio16 avail_idx;

2003 __virtio16 ring_head;

2004 int ret, access;

2005

2006 /* Check it isn't doing very strange things with descriptor numbers. */

2007 last_avail_idx = vq->last_avail_idx;

2008

2009 if (vq->avail_idx == vq->last_avail_idx) {

2010 if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {

2011 vq_err(vq, "Failed to access avail idx at %p\n",

2012 &vq->avail->idx);

2013 return -EFAULT;

2014 }

2015 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);

2016

2017 if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {

2018 vq_err(vq, "Guest moved used index from %u to %u",

2019 last_avail_idx, vq->avail_idx);

2020 return -EFAULT;

2021 }

2022

2023 /* If there's nothing new since last we looked, return

2024 * invalid.

2025 */

2026 if (vq->avail_idx == last_avail_idx)

2027 return vq->num;

2028

2029 /* Only get avail ring entries after they have been

2030 * exposed by guest.

2031 */

2032 smp_rmb();

2033 }

2034

2035 /* Grab the next descriptor number they're advertising, and increment

2036 * the index we've seen. */

2037 if (unlikely(vhost_get_avail(vq, ring_head,

2038 &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {

2039 vq_err(vq, "Failed to read head: idx %d address %p\n",

2040 last_avail_idx,

2041 &vq->avail->ring[last_avail_idx % vq->num]);

2042 return -EFAULT;

2043 }

2044

2045 head = vhost16_to_cpu(vq, ring_head);

2046

2047 /* If their number is silly, that's an error. */

2048 if (unlikely(head >= vq->num)) {

2049 vq_err(vq, "Guest says index %u > %u is available",

2050 head, vq->num);

2051 return -EINVAL;

2052 }

2053

2054 /* When we start there are none of either input nor output. */

2055 *out_num = *in_num = 0;

2056 if (unlikely(log))

2057 *log_num = 0;

2058

2059 i = head;

2060 do {

2061 unsigned iov_count = *in_num + *out_num;

2062 if (unlikely(i >= vq->num)) {

2063 vq_err(vq, "Desc index is %u > %u, head = %u",

2064 i, vq->num, head);

2065 return -EINVAL;

2066 }

2067 if (unlikely(++found > vq->num)) {

2068 vq_err(vq, "Loop detected: last one at %u "

2069 "vq size %u head %u\n",

2070 i, vq->num, head);

2071 return -EINVAL;

2072 }

2073 ret = vhost_copy_from_user(vq, &desc, vq->desc + i,

2074 sizeof desc);

2075 if (unlikely(ret)) {

2076 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",

2077 i, vq->desc + i);

2078 return -EFAULT;

2079 }

2080 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {

2081 ret = get_indirect(vq, iov, iov_size,

2082 out_num, in_num,

2083 log, log_num, &desc);

2084 if (unlikely(ret < 0)) {

2085 if (ret != -EAGAIN)

2086 vq_err(vq, "Failure detected "

2087 "in indirect descriptor at idx %d\n", i);

2088 return ret;

2089 }

2090 continue;

2091 }

2092

2093 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))

2094 access = VHOST_ACCESS_WO;

2095 else

2096 access = VHOST_ACCESS_RO;

2097 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),

2098 vhost32_to_cpu(vq, desc.len), iov + iov_count,

2099 iov_size - iov_count, access);

2100 if (unlikely(ret < 0)) {

2101 if (ret != -EAGAIN)

2102 vq_err(vq, "Translation failure %d descriptor idx %d\n",

2103 ret, i);

2104 return ret;

2105 }

2106 if (access == VHOST_ACCESS_WO) {

2107 /* If this is an input descriptor,

2108 * increment that count. */

2109 *in_num += ret;

2110 if (unlikely(log)) {

2111 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);

2112 log[*log_num].len = vhost32_to_cpu(vq, desc.len);

2113 ++*log_num;

2114 }

2115 } else {

2116 /* If it's an output descriptor, they're all supposed

2117 * to come before any input descriptors. */

2118 if (unlikely(*in_num)) {

2119 vq_err(vq, "Descriptor has out after in: "

2120 "idx %d\n", i);

2121 return -EINVAL;

2122 }

2123 *out_num += ret;

2124 }

2125 } while ((i = next_desc(vq, &desc)) != -1);

2126

2127 /* On success, increment avail index. */

2128 vq->last_avail_idx++;

2129

2130 /* Assume notifications from guest are disabled at this point,

2131 * if they aren't we would need to update avail_event index. */

2132 BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));

2133 return head;

2134 }

2135 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

2136

2137 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */

2138 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)

2139 {

2140 vq->last_avail_idx -= n;

2141 }

2142 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);

2143

2144 /* After we've used one of their buffers, we tell them about it. We'll then

2145 * want to notify the guest, using eventfd. */

2146 int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)

2147 {

2148 struct vring_used_elem heads = {

2149 cpu_to_vhost32(vq, head),

2150 cpu_to_vhost32(vq, len)

2151 };

2152

2153 return vhost_add_used_n(vq, &heads, 1);

2154 }

2155 EXPORT_SYMBOL_GPL(vhost_add_used);

2156

2157 static int __vhost_add_used_n(struct vhost_virtqueue *vq,

2158 struct vring_used_elem *heads,

2159 unsigned count)

2160 {

2161 struct vring_used_elem __user *used;

2162 u16 old, new;

2163 int start;

2164

2165 start = vq->last_used_idx & (vq->num - 1);

2166 used = vq->used->ring + start;

2167 if (count == 1) {

2168 if (vhost_put_user(vq, heads[0].id, &used->id)) {

2169 vq_err(vq, "Failed to write used id");

2170 return -EFAULT;

2171 }

2172 if (vhost_put_user(vq, heads[0].len, &used->len)) {

2173 vq_err(vq, "Failed to write used len");

2174 return -EFAULT;

2175 }

2176 } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {

2177 vq_err(vq, "Failed to write used");

2178 return -EFAULT;

2179 }

2180 if (unlikely(vq->log_used)) {

2181 /* Make sure data is seen before log. */

2182 smp_wmb();

2183 /* Log used ring entry write. */

2184 log_write(vq->log_base,

2185 vq->log_addr +

2186 ((void __user *)used - (void __user *)vq->used),

2187 count * sizeof *used);

2188 }

2189 old = vq->last_used_idx;

2190 new = (vq->last_used_idx += count);

2191 /* If the driver never bothers to signal in a very long while,

2192 * used index might wrap around. If that happens, invalidate

2193 * signalled_used index we stored. TODO: make sure driver

2194 * signals at least once in 2^16 and remove this. */

2195 if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))

2196 vq->signalled_used_valid = false;

2197 return 0;

2198 }

2199

2200 /* After we've used one of their buffers, we tell them about it. We'll then

2201 * want to notify the guest, using eventfd. */

2202 int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,

2203 unsigned count)

2204 {

2205 int start, n, r;

2206

2207 start = vq->last_used_idx & (vq->num - 1);

2208 n = vq->num - start;

2209 if (n < count) {

2210 r = __vhost_add_used_n(vq, heads, n);

2211 if (r < 0)

2212 return r;

2213 heads += n;

2214 count -= n;

2215 }

2216 r = __vhost_add_used_n(vq, heads, count);

2217

2218 /* Make sure buffer is written before we update index. */

2219 smp_wmb();

2220 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),

2221 &vq->used->idx)) {

2222 vq_err(vq, "Failed to increment used idx");

2223 return -EFAULT;

2224 }

2225 if (unlikely(vq->log_used)) {

2226 /* Log used index update. */

2227 log_write(vq->log_base,

2228 vq->log_addr + offsetof(struct vring_used, idx),

2229 sizeof vq->used->idx);

2230 if (vq->log_ctx)

2231 eventfd_signal(vq->log_ctx, 1);

2232 }

2233 return r;

2234 }

2235 EXPORT_SYMBOL_GPL(vhost_add_used_n);

2236

2237 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)

2238 {

2239 __u16 old, new;

2240 __virtio16 event;

2241 bool v;

2242 /* Flush out used index updates. This is paired

2243 * with the barrier that the Guest executes when enabling

2244 * interrupts. */

2245 smp_mb();

2246

2247 if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&

2248 unlikely(vq->avail_idx == vq->last_avail_idx))

2249 return true;

2250

2251 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {

2252 __virtio16 flags;

2253 if (vhost_get_avail(vq, flags, &vq->avail->flags)) {

2254 vq_err(vq, "Failed to get flags");

2255 return true;

2256 }

2257 return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));

2258 }

2259 old = vq->signalled_used;

2260 v = vq->signalled_used_valid;

2261 new = vq->signalled_used = vq->last_used_idx;

2262 vq->signalled_used_valid = true;

2263

2264 if (unlikely(!v))

2265 return true;

2266

2267 if (vhost_get_avail(vq, event, vhost_used_event(vq))) {

2268 vq_err(vq, "Failed to get used event idx");

2269 return true;

2270 }

2271 return vring_need_event(vhost16_to_cpu(vq, event), new, old);

2272 }

2273

2274 /* This actually signals the guest, using eventfd. */

2275 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)

2276 {

2277 /* Signal the Guest tell them we used something up. */

2278 if (vq->call_ctx && vhost_notify(dev, vq))

2279 eventfd_signal(vq->call_ctx, 1);

2280 }

2281 EXPORT_SYMBOL_GPL(vhost_signal);

2282

/*
 * Convenience wrapper: mark a single buffer (@head, @len) as used via
 * vhost_add_used() and then run vhost_signal() on the same virtqueue.
 * The result of vhost_add_used() is not checked.
 */
void vhost_add_used_and_signal(struct vhost_dev *dev,
			       struct vhost_virtqueue *vq,
			       unsigned int head, int len)
{
	vhost_add_used(vq, head, len);
	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);

2292

/*
 * Multi-buffer version of vhost_add_used_and_signal(): pass @count
 * elements from @heads to vhost_add_used_n(), then run vhost_signal().
 * The result of vhost_add_used_n() is not checked.
 */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads, unsigned count)
{
	vhost_add_used_n(vq, heads, count);
	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);

2302

2303 /* return true if we're sure that avaiable ring is empty */

2304 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)

2305 {

2306 __virtio16 avail_idx;

2307 int r;

2308

2309 if (vq->avail_idx != vq->last_avail_idx)

2310 return false;

2311

2312 r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);

2313 if (unlikely(r))

2314 return false;

2315 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);

2316

2317 return vq->avail_idx == vq->last_avail_idx;

2318 }

2319 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);

2320

2321 /* OK, now we need to know about added descriptors. */

2322 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)

2323 {

2324 __virtio16 avail_idx;

2325 int r;

2326

2327 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))

2328 return false;

2329 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;

2330 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {

2331 r = vhost_update_used_flags(vq);

2332 if (r) {

2333 vq_err(vq, "Failed to enable notification at %p: %d\n",

2334 &vq->used->flags, r);

2335 return false;

2336 }

2337 } else {

2338 r = vhost_update_avail_event(vq, vq->avail_idx);

2339 if (r) {

2340 vq_err(vq, "Failed to update avail event index at %p: %d\n",

2341 vhost_avail_event(vq), r);

2342 return false;

2343 }

2344 }

2345 /* They could have slipped one in as we were doing that: make

2346 * sure it's written, then check again. */

2347 smp_mb();

2348 r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);

2349 if (r) {

2350 vq_err(vq, "Failed to check avail idx at %p: %d\n",

2351 &vq->avail->idx, r);

2352 return false;

2353 }

2354

2355 return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;

2356 }

2357 EXPORT_SYMBOL_GPL(vhost_enable_notify);

2358

2359 /* We don't need to be notified again. */

2360 void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)

2361 {

2362 int r;

2363

2364 if (vq->used_flags & VRING_USED_F_NO_NOTIFY)

2365 return;

2366 vq->used_flags |= VRING_USED_F_NO_NOTIFY;

2367 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {

2368 r = vhost_update_used_flags(vq);

2369 if (r)

2370 vq_err(vq, "Failed to enable notification at %p: %d\n",

2371 &vq->used->flags, r);

2372 }

2373 }

2374 EXPORT_SYMBOL_GPL(vhost_disable_notify);

2375

2376 /* Create a new message. */

2377 struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)

2378 {

2379 struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);

2380 if (!node)

2381 return NULL;

2382 node->vq = vq;

2383 node->msg.type = type;

2384 return node;

2385 }

2386 EXPORT_SYMBOL_GPL(vhost_new_msg);

2387

2388 void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,

2389 struct vhost_msg_node *node)

2390 {

2391 spin_lock(&dev->iotlb_lock);

2392 list_add_tail(&node->node, head);

2393 spin_unlock(&dev->iotlb_lock);

2394

2395 wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);

2396 }

2397 EXPORT_SYMBOL_GPL(vhost_enqueue_msg);

2398

2399 struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,

2400 struct list_head *head)

2401 {

2402 struct vhost_msg_node *node = NULL;

2403

2404 spin_lock(&dev->iotlb_lock);

2405 if (!list_empty(head)) {

2406 node = list_first_entry(head, struct vhost_msg_node,

2407 node);

2408 list_del(&node->node);

2409 }

2410 spin_unlock(&dev->iotlb_lock);

2411

2412 return node;

2413 }

2414 EXPORT_SYMBOL_GPL(vhost_dequeue_msg);

2415

2416

2417 static int __init vhost_init(void)

2418 {

2419 return 0;

2420 }

2421

2422 static void __exit vhost_exit(void)

2423 {

2424 }

2425

2426 module_init(vhost_init);

2427 module_exit(vhost_exit);

2428

2429 MODULE_VERSION("0.0.1");

2430 MODULE_LICENSE("GPL v2");

2431 MODULE_AUTHOR("Michael S. Tsirkin");

2432 MODULE_DESCRIPTION("Host kernel accelerator for virtio");