git.ipfire.org Git - thirdparty/kernel/stable.git/blob

1 /*

2 * Kernel-based Virtual Machine driver for Linux

3 *

4 * derived from drivers/kvm/kvm_main.c

5 *

7 *

8 * Authors:

9 * Avi Kivity <avi@qumranet.com>

10 * Yaniv Kamay <yaniv@qumranet.com>

11 *

12 * This work is licensed under the terms of the GNU GPL, version 2. See

13 * the COPYING file in the top-level directory.

14 *

15 */

17 #include <linux/kvm_host.h>

18 #include "irq.h"

19 #include "mmu.h"

20 #include "i8254.h"

21 #include "tss.h"

23 #include <linux/clocksource.h>

24 #include <linux/kvm.h>

25 #include <linux/fs.h>

26 #include <linux/vmalloc.h>

27 #include <linux/module.h>

28 #include <linux/mman.h>

29 #include <linux/highmem.h>

31 #include <asm/uaccess.h>

32 #include <asm/msr.h>

33 #include <asm/desc.h>

/* msr_io() rejects any request carrying this many entries or more. */
#define MAX_IO_MSRS 256

/* Any CR0 bit not listed here makes kvm_set_cr0() inject #GP. */
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM		\
			  | X86_CR0_TS | X86_CR0_ET | X86_CR0_NE	\
			  | X86_CR0_WP | X86_CR0_AM | X86_CR0_NW	\
			  | X86_CR0_CD | X86_CR0_PG))

/* Any CR4 bit not listed here makes kvm_set_cr4() inject #GP. */
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD	\
			  | X86_CR4_DE | X86_CR4_PSE | X86_CR4_PAE	\
			  | X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE	\
			  | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT	\
			  | X86_CR4_VMXE))

/* Only the TPR field of CR8 may be written by the guest. */
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

47 /* EFER defaults:

48 * - enable syscall per default because its emulated by KVM

49 * - enable LME and LMA per default on 64 bit KVM

50 */

51 #ifdef CONFIG_X86_64

52 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;

53 #else

54 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;

55 #endif

57 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM

58 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

60 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,

61 struct kvm_cpuid_entry2 __user *entries);

63 struct kvm_x86_ops *kvm_x86_ops;

65 struct kvm_stats_debugfs_item debugfs_entries[] = {

66 { "pf_fixed", VCPU_STAT(pf_fixed) },

67 { "pf_guest", VCPU_STAT(pf_guest) },

68 { "tlb_flush", VCPU_STAT(tlb_flush) },

69 { "invlpg", VCPU_STAT(invlpg) },

70 { "exits", VCPU_STAT(exits) },

71 { "io_exits", VCPU_STAT(io_exits) },

72 { "mmio_exits", VCPU_STAT(mmio_exits) },

73 { "signal_exits", VCPU_STAT(signal_exits) },

74 { "irq_window", VCPU_STAT(irq_window_exits) },

75 { "nmi_window", VCPU_STAT(nmi_window_exits) },

76 { "halt_exits", VCPU_STAT(halt_exits) },

77 { "halt_wakeup", VCPU_STAT(halt_wakeup) },

78 { "hypercalls", VCPU_STAT(hypercalls) },

79 { "request_irq", VCPU_STAT(request_irq_exits) },

80 { "irq_exits", VCPU_STAT(irq_exits) },

81 { "host_state_reload", VCPU_STAT(host_state_reload) },

82 { "efer_reload", VCPU_STAT(efer_reload) },

83 { "fpu_reload", VCPU_STAT(fpu_reload) },

84 { "insn_emulation", VCPU_STAT(insn_emulation) },

85 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },

86 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },

87 { "mmu_pte_write", VM_STAT(mmu_pte_write) },

88 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },

89 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },

90 { "mmu_flooded", VM_STAT(mmu_flooded) },

91 { "mmu_recycled", VM_STAT(mmu_recycled) },

92 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },

93 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },

94 { "largepages", VM_STAT(lpages) },

95 { NULL }

96 };

99 unsigned long segment_base(u16 selector)

100 {

101 struct descriptor_table gdt;

102 struct desc_struct *d;

103 unsigned long table_base;

104 unsigned long v;

105

106 if (selector == 0)

107 return 0;

108

109 asm("sgdt %0" : "=m"(gdt));

110 table_base = gdt.base;

111

112 if (selector & 4) { /* from ldt */

113 u16 ldt_selector;

114

115 asm("sldt %0" : "=g"(ldt_selector));

116 table_base = segment_base(ldt_selector);

117 }

118 d = (struct desc_struct *)(table_base + (selector & ~7));

119 v = d->base0 | ((unsigned long)d->base1 << 16) |

120 ((unsigned long)d->base2 << 24);

121 #ifdef CONFIG_X86_64

122 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))

123 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;

124 #endif

125 return v;

126 }

127 EXPORT_SYMBOL_GPL(segment_base);

128

129 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)

130 {

131 if (irqchip_in_kernel(vcpu->kvm))

132 return vcpu->arch.apic_base;

133 else

134 return vcpu->arch.apic_base;

135 }

136 EXPORT_SYMBOL_GPL(kvm_get_apic_base);

137

138 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)

139 {

140 /* TODO: reserve bits check */

141 if (irqchip_in_kernel(vcpu->kvm))

142 kvm_lapic_set_base(vcpu, data);

143 else

144 vcpu->arch.apic_base = data;

145 }

146 EXPORT_SYMBOL_GPL(kvm_set_apic_base);

147

148 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)

149 {

150 WARN_ON(vcpu->arch.exception.pending);

151 vcpu->arch.exception.pending = true;

152 vcpu->arch.exception.has_error_code = false;

153 vcpu->arch.exception.nr = nr;

154 }

155 EXPORT_SYMBOL_GPL(kvm_queue_exception);

156

157 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,

158 u32 error_code)

159 {

160 ++vcpu->stat.pf_guest;

161 if (vcpu->arch.exception.pending) {

162 if (vcpu->arch.exception.nr == PF_VECTOR) {

163 printk(KERN_DEBUG "kvm: inject_page_fault:"

164 " double fault 0x%lx\n", addr);

165 vcpu->arch.exception.nr = DF_VECTOR;

166 vcpu->arch.exception.error_code = 0;

167 } else if (vcpu->arch.exception.nr == DF_VECTOR) {

168 /* triple fault -> shutdown */

169 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);

170 }

171 return;

172 }

173 vcpu->arch.cr2 = addr;

174 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);

175 }

176

177 void kvm_inject_nmi(struct kvm_vcpu *vcpu)

178 {

179 vcpu->arch.nmi_pending = 1;

180 }

181 EXPORT_SYMBOL_GPL(kvm_inject_nmi);

182

183 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)

184 {

185 WARN_ON(vcpu->arch.exception.pending);

186 vcpu->arch.exception.pending = true;

187 vcpu->arch.exception.has_error_code = true;

188 vcpu->arch.exception.nr = nr;

189 vcpu->arch.exception.error_code = error_code;

190 }

191 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

192

193 static void __queue_exception(struct kvm_vcpu *vcpu)

194 {

195 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,

196 vcpu->arch.exception.has_error_code,

197 vcpu->arch.exception.error_code);

198 }

199

200 /*

201 * Load the pae pdptrs. Return true is they are all valid.

202 */

203 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)

204 {

205 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;

206 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;

207 int i;

208 int ret;

209 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

210

211 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,

212 offset * sizeof(u64), sizeof(pdpte));

213 if (ret < 0) {

214 ret = 0;

215 goto out;

216 }

217 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {

218 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {

219 ret = 0;

220 goto out;

221 }

222 }

223 ret = 1;

224

225 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));

226 out:

227

228 return ret;

229 }

230 EXPORT_SYMBOL_GPL(load_pdptrs);

231

232 static bool pdptrs_changed(struct kvm_vcpu *vcpu)

233 {

234 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

235 bool changed = true;

236 int r;

237

238 if (is_long_mode(vcpu) || !is_pae(vcpu))

239 return false;

240

241 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));

242 if (r < 0)

243 goto out;

244 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;

245 out:

246

247 return changed;

248 }

249

250 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)

251 {

252 if (cr0 & CR0_RESERVED_BITS) {

253 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",

254 cr0, vcpu->arch.cr0);

255 kvm_inject_gp(vcpu, 0);

256 return;

257 }

258

259 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {

260 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");

261 kvm_inject_gp(vcpu, 0);

262 return;

263 }

264

265 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {

266 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "

267 "and a clear PE flag\n");

268 kvm_inject_gp(vcpu, 0);

269 return;

270 }

271

272 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {

273 #ifdef CONFIG_X86_64

274 if ((vcpu->arch.shadow_efer & EFER_LME)) {

275 int cs_db, cs_l;

276

277 if (!is_pae(vcpu)) {

278 printk(KERN_DEBUG "set_cr0: #GP, start paging "

279 "in long mode while PAE is disabled\n");

280 kvm_inject_gp(vcpu, 0);

281 return;

282 }

283 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

284 if (cs_l) {

285 printk(KERN_DEBUG "set_cr0: #GP, start paging "

286 "in long mode while CS.L == 1\n");

287 kvm_inject_gp(vcpu, 0);

288 return;

289

290 }

291 } else

292 #endif

293 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {

294 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "

295 "reserved bits\n");

296 kvm_inject_gp(vcpu, 0);

297 return;

298 }

299

300 }

301

302 kvm_x86_ops->set_cr0(vcpu, cr0);

303 vcpu->arch.cr0 = cr0;

304

305 kvm_mmu_reset_context(vcpu);

306 return;

307 }

308 EXPORT_SYMBOL_GPL(kvm_set_cr0);

309

310 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)

311 {

312 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));

313 KVMTRACE_1D(LMSW, vcpu,

314 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),

315 handler);

316 }

317 EXPORT_SYMBOL_GPL(kvm_lmsw);

318

319 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)

320 {

321 if (cr4 & CR4_RESERVED_BITS) {

322 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");

323 kvm_inject_gp(vcpu, 0);

324 return;

325 }

326

327 if (is_long_mode(vcpu)) {

328 if (!(cr4 & X86_CR4_PAE)) {

329 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "

330 "in long mode\n");

331 kvm_inject_gp(vcpu, 0);

332 return;

333 }

334 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)

335 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {

336 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");

337 kvm_inject_gp(vcpu, 0);

338 return;

339 }

340

341 if (cr4 & X86_CR4_VMXE) {

342 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");

343 kvm_inject_gp(vcpu, 0);

344 return;

345 }

346 kvm_x86_ops->set_cr4(vcpu, cr4);

347 vcpu->arch.cr4 = cr4;

348 kvm_mmu_reset_context(vcpu);

349 }

350 EXPORT_SYMBOL_GPL(kvm_set_cr4);

351

352 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)

353 {

354 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {

355 kvm_mmu_flush_tlb(vcpu);

356 return;

357 }

358

359 if (is_long_mode(vcpu)) {

360 if (cr3 & CR3_L_MODE_RESERVED_BITS) {

361 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");

362 kvm_inject_gp(vcpu, 0);

363 return;

364 }

365 } else {

366 if (is_pae(vcpu)) {

367 if (cr3 & CR3_PAE_RESERVED_BITS) {

368 printk(KERN_DEBUG

369 "set_cr3: #GP, reserved bits\n");

370 kvm_inject_gp(vcpu, 0);

371 return;

372 }

373 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {

374 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "

375 "reserved bits\n");

376 kvm_inject_gp(vcpu, 0);

377 return;

378 }

379 }

380 /*

381 * We don't check reserved bits in nonpae mode, because

382 * this isn't enforced, and VMware depends on this.

383 */

384 }

385

386 /*

387 * Does the new cr3 value map to physical memory? (Note, we

388 * catch an invalid cr3 even in real-mode, because it would

389 * cause trouble later on when we turn on paging anyway.)

390 *

391 * A real CPU would silently accept an invalid cr3 and would

392 * attempt to use it - with largely undefined (and often hard

393 * to debug) behavior on the guest side.

394 */

395 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))

396 kvm_inject_gp(vcpu, 0);

397 else {

398 vcpu->arch.cr3 = cr3;

399 vcpu->arch.mmu.new_cr3(vcpu);

400 }

401 }

402 EXPORT_SYMBOL_GPL(kvm_set_cr3);

403

404 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)

405 {

406 if (cr8 & CR8_RESERVED_BITS) {

407 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);

408 kvm_inject_gp(vcpu, 0);

409 return;

410 }

411 if (irqchip_in_kernel(vcpu->kvm))

412 kvm_lapic_set_tpr(vcpu, cr8);

413 else

414 vcpu->arch.cr8 = cr8;

415 }

416 EXPORT_SYMBOL_GPL(kvm_set_cr8);

417

418 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)

419 {

420 if (irqchip_in_kernel(vcpu->kvm))

421 return kvm_lapic_get_cr8(vcpu);

422 else

423 return vcpu->arch.cr8;

424 }

425 EXPORT_SYMBOL_GPL(kvm_get_cr8);

426

427 /*

428 * List of msr numbers which we expose to userspace through KVM_GET_MSRS

429 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.

430 *

431 * This list is modified at module load time to reflect the

432 * capabilities of the host cpu.

433 */

434 static u32 msrs_to_save[] = {

435 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,

436 MSR_K6_STAR,

437 #ifdef CONFIG_X86_64

438 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,

439 #endif

440 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,

441 MSR_IA32_PERF_STATUS,

442 };

443

444 static unsigned num_msrs_to_save;

445

446 static u32 emulated_msrs[] = {

447 MSR_IA32_MISC_ENABLE,

448 };

449

450 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)

451 {

452 if (efer & efer_reserved_bits) {

453 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",

454 efer);

455 kvm_inject_gp(vcpu, 0);

456 return;

457 }

458

459 if (is_paging(vcpu)

460 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {

461 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");

462 kvm_inject_gp(vcpu, 0);

463 return;

464 }

465

466 kvm_x86_ops->set_efer(vcpu, efer);

467

468 efer &= ~EFER_LMA;

469 efer |= vcpu->arch.shadow_efer & EFER_LMA;

470

471 vcpu->arch.shadow_efer = efer;

472 }

473

474 void kvm_enable_efer_bits(u64 mask)

475 {

476 efer_reserved_bits &= ~mask;

477 }

478 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);

479

480

481 /*

482 * Writes msr value into into the appropriate "register".

483 * Returns 0 on success, non-0 otherwise.

484 * Assumes vcpu_load() was already called.

485 */

486 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)

487 {

488 return kvm_x86_ops->set_msr(vcpu, msr_index, data);

489 }

490

491 /*

492 * Adapt set_msr() to msr_io()'s calling convention

493 */

494 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)

495 {

496 return kvm_set_msr(vcpu, index, *data);

497 }

498

499 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)

500 {

501 static int version;

502 struct pvclock_wall_clock wc;

503 struct timespec now, sys, boot;

504

505 if (!wall_clock)

506 return;

507

508 version++;

509

510 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

511

512 /*

513 * The guest calculates current wall clock time by adding

514 * system time (updated by kvm_write_guest_time below) to the

515 * wall clock specified here. guest system time equals host

516 * system time for us, thus we must fill in host boot time here.

517 */

518 now = current_kernel_time();

519 ktime_get_ts(&sys);

520 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));

521

522 wc.sec = boot.tv_sec;

523 wc.nsec = boot.tv_nsec;

524 wc.version = version;

525

526 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

527

528 version++;

529 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

530 }

531

/*
 * Return the 32-bit quotient of "(dividend << 32) / divisor".
 *
 * The x86 divl instruction divides the 64-bit value held in edx:eax;
 * @dividend is loaded into edx and 0 into eax.
 */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quot, rem;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quot), "=d" (rem)
		  : "0" (0), "1" (dividend), "r" (divisor) );

	return quot;
}

543

544 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)

545 {

546 uint64_t nsecs = 1000000000LL;

547 int32_t shift = 0;

548 uint64_t tps64;

549 uint32_t tps32;

550

551 tps64 = tsc_khz * 1000LL;

552 while (tps64 > nsecs*2) {

553 tps64 >>= 1;

554 shift--;

555 }

556

557 tps32 = (uint32_t)tps64;

558 while (tps32 <= (uint32_t)nsecs) {

559 tps32 <<= 1;

560 shift++;

561 }

562

563 hv_clock->tsc_shift = shift;

564 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);

565

566 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",

567 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,

568 hv_clock->tsc_to_system_mul);

569 }

570

571 static void kvm_write_guest_time(struct kvm_vcpu *v)

572 {

573 struct timespec ts;

574 unsigned long flags;

575 struct kvm_vcpu_arch *vcpu = &v->arch;

576 void *shared_kaddr;

577

578 if ((!vcpu->time_page))

579 return;

580

581 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {

582 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);

583 vcpu->hv_clock_tsc_khz = tsc_khz;

584 }

585

586 /* Keep irq disabled to prevent changes to the clock */

587 local_irq_save(flags);

588 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,

589 &vcpu->hv_clock.tsc_timestamp);

590 ktime_get_ts(&ts);

591 local_irq_restore(flags);

592

593 /* With all the info we got, fill in the values */

594

595 vcpu->hv_clock.system_time = ts.tv_nsec +

596 (NSEC_PER_SEC * (u64)ts.tv_sec);

597 /*

598 * The interface expects us to write an even number signaling that the

599 * update is finished. Since the guest won't see the intermediate

600 * state, we just increase by 2 at the end.

601 */

602 vcpu->hv_clock.version += 2;

603

604 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

605

606 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,

607 sizeof(vcpu->hv_clock));

608

609 kunmap_atomic(shared_kaddr, KM_USER0);

610

611 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);

612 }

613

614 static bool msr_mtrr_valid(unsigned msr)

615 {

616 switch (msr) {

617 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:

618 case MSR_MTRRfix64K_00000:

619 case MSR_MTRRfix16K_80000:

620 case MSR_MTRRfix16K_A0000:

621 case MSR_MTRRfix4K_C0000:

622 case MSR_MTRRfix4K_C8000:

623 case MSR_MTRRfix4K_D0000:

624 case MSR_MTRRfix4K_D8000:

625 case MSR_MTRRfix4K_E0000:

626 case MSR_MTRRfix4K_E8000:

627 case MSR_MTRRfix4K_F0000:

628 case MSR_MTRRfix4K_F8000:

629 case MSR_MTRRdefType:

630 case MSR_IA32_CR_PAT:

631 return true;

632 case 0x2f8:

633 return true;

634 }

635 return false;

636 }

637

638 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)

639 {

640 if (!msr_mtrr_valid(msr))

641 return 1;

642

643 vcpu->arch.mtrr[msr - 0x200] = data;

644 return 0;

645 }

646

647 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)

648 {

649 switch (msr) {

650 case MSR_EFER:

651 set_efer(vcpu, data);

652 break;

653 case MSR_IA32_MC0_STATUS:

654 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",

655 __func__, data);

656 break;

657 case MSR_IA32_MCG_STATUS:

658 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",

659 __func__, data);

660 break;

661 case MSR_IA32_MCG_CTL:

662 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",

663 __func__, data);

664 break;

665 case MSR_IA32_UCODE_REV:

666 case MSR_IA32_UCODE_WRITE:

667 break;

668 case 0x200 ... 0x2ff:

669 return set_msr_mtrr(vcpu, msr, data);

670 case MSR_IA32_APICBASE:

671 kvm_set_apic_base(vcpu, data);

672 break;

673 case MSR_IA32_MISC_ENABLE:

674 vcpu->arch.ia32_misc_enable_msr = data;

675 break;

676 case MSR_KVM_WALL_CLOCK:

677 vcpu->kvm->arch.wall_clock = data;

678 kvm_write_wall_clock(vcpu->kvm, data);

679 break;

680 case MSR_KVM_SYSTEM_TIME: {

681 if (vcpu->arch.time_page) {

682 kvm_release_page_dirty(vcpu->arch.time_page);

683 vcpu->arch.time_page = NULL;

684 }

685

686 vcpu->arch.time = data;

687

688 /* we verify if the enable bit is set... */

689 if (!(data & 1))

690 break;

691

692 /* ...but clean it before doing the actual write */

693 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);

694

695 down_read(&current->mm->mmap_sem);

696 vcpu->arch.time_page =

697 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);

698 up_read(&current->mm->mmap_sem);

699

700 if (is_error_page(vcpu->arch.time_page)) {

701 kvm_release_page_clean(vcpu->arch.time_page);

702 vcpu->arch.time_page = NULL;

703 }

704

705 kvm_write_guest_time(vcpu);

706 break;

707 }

708 default:

709 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);

710 return 1;

711 }

712 return 0;

713 }

714 EXPORT_SYMBOL_GPL(kvm_set_msr_common);

715

716

717 /*

718 * Reads an msr value (of 'msr_index') into 'pdata'.

719 * Returns 0 on success, non-0 otherwise.

720 * Assumes vcpu_load() was already called.

721 */

722 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)

723 {

724 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);

725 }

726

727 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)

728 {

729 if (!msr_mtrr_valid(msr))

730 return 1;

731

732 *pdata = vcpu->arch.mtrr[msr - 0x200];

733 return 0;

734 }

735

736 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)

737 {

738 u64 data;

739

740 switch (msr) {

741 case 0xc0010010: /* SYSCFG */

742 case 0xc0010015: /* HWCR */

743 case MSR_IA32_PLATFORM_ID:

744 case MSR_IA32_P5_MC_ADDR:

745 case MSR_IA32_P5_MC_TYPE:

746 case MSR_IA32_MC0_CTL:

747 case MSR_IA32_MCG_STATUS:

748 case MSR_IA32_MCG_CAP:

749 case MSR_IA32_MCG_CTL:

750 case MSR_IA32_MC0_MISC:

751 case MSR_IA32_MC0_MISC+4:

752 case MSR_IA32_MC0_MISC+8:

753 case MSR_IA32_MC0_MISC+12:

754 case MSR_IA32_MC0_MISC+16:

755 case MSR_IA32_MC0_MISC+20:

756 case MSR_IA32_UCODE_REV:

757 case MSR_IA32_EBL_CR_POWERON:

758 data = 0;

759 break;

760 case MSR_MTRRcap:

761 data = 0x500 | KVM_NR_VAR_MTRR;

762 break;

763 case 0x200 ... 0x2ff:

764 return get_msr_mtrr(vcpu, msr, pdata);

765 case 0xcd: /* fsb frequency */

766 data = 3;

767 break;

768 case MSR_IA32_APICBASE:

769 data = kvm_get_apic_base(vcpu);

770 break;

771 case MSR_IA32_MISC_ENABLE:

772 data = vcpu->arch.ia32_misc_enable_msr;

773 break;

774 case MSR_IA32_PERF_STATUS:

775 /* TSC increment by tick */

776 data = 1000ULL;

777 /* CPU multiplier */

778 data |= (((uint64_t)4ULL) << 40);

779 break;

780 case MSR_EFER:

781 data = vcpu->arch.shadow_efer;

782 break;

783 case MSR_KVM_WALL_CLOCK:

784 data = vcpu->kvm->arch.wall_clock;

785 break;

786 case MSR_KVM_SYSTEM_TIME:

787 data = vcpu->arch.time;

788 break;

789 default:

790 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);

791 return 1;

792 }

793 *pdata = data;

794 return 0;

795 }

796 EXPORT_SYMBOL_GPL(kvm_get_msr_common);

797

798 /*

799 * Read or write a bunch of msrs. All parameters are kernel addresses.

800 *

801 * @return number of msrs set successfully.

802 */

803 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,

804 struct kvm_msr_entry *entries,

805 int (*do_msr)(struct kvm_vcpu *vcpu,

806 unsigned index, u64 *data))

807 {

808 int i;

809

810 vcpu_load(vcpu);

811

812 down_read(&vcpu->kvm->slots_lock);

813 for (i = 0; i < msrs->nmsrs; ++i)

814 if (do_msr(vcpu, entries[i].index, &entries[i].data))

815 break;

816 up_read(&vcpu->kvm->slots_lock);

817

818 vcpu_put(vcpu);

819

820 return i;

821 }

822

823 /*

824 * Read or write a bunch of msrs. Parameters are user addresses.

825 *

826 * @return number of msrs set successfully.

827 */

828 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,

829 int (*do_msr)(struct kvm_vcpu *vcpu,

830 unsigned index, u64 *data),

831 int writeback)

832 {

833 struct kvm_msrs msrs;

834 struct kvm_msr_entry *entries;

835 int r, n;

836 unsigned size;

837

838 r = -EFAULT;

839 if (copy_from_user(&msrs, user_msrs, sizeof msrs))

840 goto out;

841

842 r = -E2BIG;

843 if (msrs.nmsrs >= MAX_IO_MSRS)

844 goto out;

845

846 r = -ENOMEM;

847 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;

848 entries = vmalloc(size);

849 if (!entries)

850 goto out;

851

852 r = -EFAULT;

853 if (copy_from_user(entries, user_msrs->entries, size))

854 goto out_free;

855

856 r = n = __msr_io(vcpu, &msrs, entries, do_msr);

857 if (r < 0)

858 goto out_free;

859

860 r = -EFAULT;

861 if (writeback && copy_to_user(user_msrs->entries, entries, size))

862 goto out_free;

863

864 r = n;

865

866 out_free:

867 vfree(entries);

868 out:

869 return r;

870 }

871

872 int kvm_dev_ioctl_check_extension(long ext)

873 {

874 int r;

875

876 switch (ext) {

877 case KVM_CAP_IRQCHIP:

878 case KVM_CAP_HLT:

879 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:

880 case KVM_CAP_USER_MEMORY:

881 case KVM_CAP_SET_TSS_ADDR:

882 case KVM_CAP_EXT_CPUID:

883 case KVM_CAP_CLOCKSOURCE:

884 case KVM_CAP_PIT:

885 case KVM_CAP_NOP_IO_DELAY:

886 case KVM_CAP_MP_STATE:

887 case KVM_CAP_SYNC_MMU:

888 r = 1;

889 break;

890 case KVM_CAP_COALESCED_MMIO:

891 r = KVM_COALESCED_MMIO_PAGE_OFFSET;

892 break;

893 case KVM_CAP_VAPIC:

894 r = !kvm_x86_ops->cpu_has_accelerated_tpr();

895 break;

896 case KVM_CAP_NR_VCPUS:

897 r = KVM_MAX_VCPUS;

898 break;

899 case KVM_CAP_NR_MEMSLOTS:

900 r = KVM_MEMORY_SLOTS;

901 break;

902 case KVM_CAP_PV_MMU:

903 r = !tdp_enabled;

904 break;

905 default:

906 r = 0;

907 break;

908 }

909 return r;

910

911 }

912

913 long kvm_arch_dev_ioctl(struct file *filp,

914 unsigned int ioctl, unsigned long arg)

915 {

916 void __user *argp = (void __user *)arg;

917 long r;

918

919 switch (ioctl) {

920 case KVM_GET_MSR_INDEX_LIST: {

921 struct kvm_msr_list __user *user_msr_list = argp;

922 struct kvm_msr_list msr_list;

923 unsigned n;

924

925 r = -EFAULT;

926 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))

927 goto out;

928 n = msr_list.nmsrs;

929 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);

930 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))

931 goto out;

932 r = -E2BIG;

933 if (n < num_msrs_to_save)

934 goto out;

935 r = -EFAULT;

936 if (copy_to_user(user_msr_list->indices, &msrs_to_save,

937 num_msrs_to_save * sizeof(u32)))

938 goto out;

939 if (copy_to_user(user_msr_list->indices

940 + num_msrs_to_save * sizeof(u32),

941 &emulated_msrs,

942 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))

943 goto out;

944 r = 0;

945 break;

946 }

947 case KVM_GET_SUPPORTED_CPUID: {

948 struct kvm_cpuid2 __user *cpuid_arg = argp;

949 struct kvm_cpuid2 cpuid;

950

951 r = -EFAULT;

952 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))

953 goto out;

954 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,

955 cpuid_arg->entries);

956 if (r)

957 goto out;

958

959 r = -EFAULT;

960 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))

961 goto out;

962 r = 0;

963 break;

964 }

965 default:

966 r = -EINVAL;

967 }

968 out:

969 return r;

970 }

971

972 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

973 {

974 kvm_x86_ops->vcpu_load(vcpu, cpu);

975 kvm_write_guest_time(vcpu);

976 }

977

978 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)

979 {

980 kvm_x86_ops->vcpu_put(vcpu);

981 kvm_put_guest_fpu(vcpu);

982 }

983

984 static int is_efer_nx(void)

985 {

986 u64 efer;

987

988 rdmsrl(MSR_EFER, efer);

989 return efer & EFER_NX;

990 }

991

992 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)

993 {

994 int i;

995 struct kvm_cpuid_entry2 *e, *entry;

996

997 entry = NULL;

998 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {

999 e = &vcpu->arch.cpuid_entries[i];

1000 if (e->function == 0x80000001) {

1001 entry = e;

1002 break;

1003 }

1004 }

1005 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {

1006 entry->edx &= ~(1 << 20);

1007 printk(KERN_INFO "kvm: guest NX capability removed\n");

1008 }

1009 }

1010

1011 /* when an old userspace process fills a new kernel module */

1012 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,

1013 struct kvm_cpuid *cpuid,

1014 struct kvm_cpuid_entry __user *entries)

1015 {

1016 int r, i;

1017 struct kvm_cpuid_entry *cpuid_entries;

1018

1019 r = -E2BIG;

1020 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)

1021 goto out;

1022 r = -ENOMEM;

1023 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);

1024 if (!cpuid_entries)

1025 goto out;

1026 r = -EFAULT;

1027 if (copy_from_user(cpuid_entries, entries,

1028 cpuid->nent * sizeof(struct kvm_cpuid_entry)))

1029 goto out_free;

1030 for (i = 0; i < cpuid->nent; i++) {

1031 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;

1032 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;

1033 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;

1034 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;

1035 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;

1036 vcpu->arch.cpuid_entries[i].index = 0;

1037 vcpu->arch.cpuid_entries[i].flags = 0;

1038 vcpu->arch.cpuid_entries[i].padding[0] = 0;

1039 vcpu->arch.cpuid_entries[i].padding[1] = 0;

1040 vcpu->arch.cpuid_entries[i].padding[2] = 0;

1041 }

1042 vcpu->arch.cpuid_nent = cpuid->nent;

1043 cpuid_fix_nx_cap(vcpu);

1044 r = 0;

1045

1046 out_free:

1047 vfree(cpuid_entries);

1048 out:

1049 return r;

1050 }

1051

1052 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,

1053 struct kvm_cpuid2 *cpuid,

1054 struct kvm_cpuid_entry2 __user *entries)

1055 {

1056 int r;

1057

1058 r = -E2BIG;

1059 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)

1060 goto out;

1061 r = -EFAULT;

1062 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,

1063 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))

1064 goto out;

1065 vcpu->arch.cpuid_nent = cpuid->nent;

1066 return 0;

1067

1068 out:

1069 return r;

1070 }

1071

1072 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,

1073 struct kvm_cpuid2 *cpuid,

1074 struct kvm_cpuid_entry2 __user *entries)

1075 {

1076 int r;

1077

1078 r = -E2BIG;

1079 if (cpuid->nent < vcpu->arch.cpuid_nent)

1080 goto out;

1081 r = -EFAULT;

1082 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,

1083 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))

1084 goto out;

1085 return 0;

1086

1087 out:

1088 cpuid->nent = vcpu->arch.cpuid_nent;

1089 return r;

1090 }

1091

/*
 * Return a mask with one bit set at position (@bitno mod 32).
 *
 * The shifted constant is unsigned so that position 31 does not shift a
 * one into the sign bit of a signed int, which is undefined behaviour.
 */
static inline u32 bit(int bitno)
{
	return 1U << (bitno & 31);
}

1096

1097 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,

1098 u32 index)

1099 {

1100 entry->function = function;

1101 entry->index = index;

1102 cpuid_count(entry->function, entry->index,

1103 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);

1104 entry->flags = 0;

1105 }

1106

1107 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

1108 u32 index, int *nent, int maxnent)

1109 {

1110 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |

1111 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |

1112 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |

1113 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |

1114 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |

1115 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |

1116 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |

1117 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |

1118 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |

1119 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);

1120 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |

1121 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |

1122 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |

1123 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |

1124 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |

1125 bit(X86_FEATURE_PGE) |

1126 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |

1127 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |

1128 bit(X86_FEATURE_SYSCALL) |

1129 (bit(X86_FEATURE_NX) && is_efer_nx()) |

1130 #ifdef CONFIG_X86_64

1131 bit(X86_FEATURE_LM) |

1132 #endif

1133 bit(X86_FEATURE_MMXEXT) |

1134 bit(X86_FEATURE_3DNOWEXT) |

1135 bit(X86_FEATURE_3DNOW);

1136 const u32 kvm_supported_word3_x86_features =

1137 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);

1138 const u32 kvm_supported_word6_x86_features =

1139 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);

1140

1141 /* all func 2 cpuid_count() should be called on the same cpu */

1142 get_cpu();

1143 do_cpuid_1_ent(entry, function, index);

1144 ++*nent;

1145

1146 switch (function) {

1147 case 0:

1148 entry->eax = min(entry->eax, (u32)0xb);

1149 break;

1150 case 1:

1151 entry->edx &= kvm_supported_word0_x86_features;

1152 entry->ecx &= kvm_supported_word3_x86_features;

1153 break;

1154 /* function 2 entries are STATEFUL. That is, repeated cpuid commands

1155 * may return different values. This forces us to get_cpu() before

1156 * issuing the first command, and also to emulate this annoying behavior

1157 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */

1158 case 2: {

1159 int t, times = entry->eax & 0xff;

1160

1161 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;

1162 for (t = 1; t < times && *nent < maxnent; ++t) {

1163 do_cpuid_1_ent(&entry[t], function, 0);

1164 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;

1165 ++*nent;

1166 }

1167 break;

1168 }

1169 /* function 4 and 0xb have additional index. */

1170 case 4: {

1171 int i, cache_type;

1172

1173 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;

1174 /* read more entries until cache_type is zero */

1175 for (i = 1; *nent < maxnent; ++i) {

1176 cache_type = entry[i - 1].eax & 0x1f;

1177 if (!cache_type)

1178 break;

1179 do_cpuid_1_ent(&entry[i], function, i);

1180 entry[i].flags |=

1181 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;

1182 ++*nent;

1183 }

1184 break;

1185 }

1186 case 0xb: {

1187 int i, level_type;

1188

1189 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;

1190 /* read more entries until level_type is zero */

1191 for (i = 1; *nent < maxnent; ++i) {

1192 level_type = entry[i - 1].ecx & 0xff;

1193 if (!level_type)

1194 break;

1195 do_cpuid_1_ent(&entry[i], function, i);

1196 entry[i].flags |=

1197 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;

1198 ++*nent;

1199 }

1200 break;

1201 }

1202 case 0x80000000:

1203 entry->eax = min(entry->eax, 0x8000001a);

1204 break;

1205 case 0x80000001:

1206 entry->edx &= kvm_supported_word1_x86_features;

1207 entry->ecx &= kvm_supported_word6_x86_features;

1208 break;

1209 }

1210 put_cpu();

1211 }

1212

1213 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,

1214 struct kvm_cpuid_entry2 __user *entries)

1215 {

1216 struct kvm_cpuid_entry2 *cpuid_entries;

1217 int limit, nent = 0, r = -E2BIG;

1218 u32 func;

1219

1220 if (cpuid->nent < 1)

1221 goto out;

1222 r = -ENOMEM;

1223 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);

1224 if (!cpuid_entries)

1225 goto out;

1226

1227 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);

1228 limit = cpuid_entries[0].eax;

1229 for (func = 1; func <= limit && nent < cpuid->nent; ++func)

1230 do_cpuid_ent(&cpuid_entries[nent], func, 0,

1231 &nent, cpuid->nent);

1232 r = -E2BIG;

1233 if (nent >= cpuid->nent)

1234 goto out_free;

1235

1236 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);

1237 limit = cpuid_entries[nent - 1].eax;

1238 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)

1239 do_cpuid_ent(&cpuid_entries[nent], func, 0,

1240 &nent, cpuid->nent);

1241 r = -EFAULT;

1242 if (copy_to_user(entries, cpuid_entries,

1243 nent * sizeof(struct kvm_cpuid_entry2)))

1244 goto out_free;

1245 cpuid->nent = nent;

1246 r = 0;

1247

1248 out_free:

1249 vfree(cpuid_entries);

1250 out:

1251 return r;

1252 }

1253

1254 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,

1255 struct kvm_lapic_state *s)

1256 {

1257 vcpu_load(vcpu);

1258 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);

1259 vcpu_put(vcpu);

1260

1261 return 0;

1262 }

1263

1264 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,

1265 struct kvm_lapic_state *s)

1266 {

1267 vcpu_load(vcpu);

1268 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);

1269 kvm_apic_post_state_restore(vcpu);

1270 vcpu_put(vcpu);

1271

1272 return 0;

1273 }

1274

1275 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,

1276 struct kvm_interrupt *irq)

1277 {

1278 if (irq->irq < 0 || irq->irq >= 256)

1279 return -EINVAL;

1280 if (irqchip_in_kernel(vcpu->kvm))

1281 return -ENXIO;

1282 vcpu_load(vcpu);

1283

1284 set_bit(irq->irq, vcpu->arch.irq_pending);

1285 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);

1286

1287 vcpu_put(vcpu);

1288

1289 return 0;

1290 }

1291

1292 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,

1293 struct kvm_tpr_access_ctl *tac)

1294 {

1295 if (tac->flags)

1296 return -EINVAL;

1297 vcpu->arch.tpr_access_reporting = !!tac->enabled;

1298 return 0;

1299 }

1300

1301 long kvm_arch_vcpu_ioctl(struct file *filp,

1302 unsigned int ioctl, unsigned long arg)

1303 {

1304 struct kvm_vcpu *vcpu = filp->private_data;

1305 void __user *argp = (void __user *)arg;

1306 int r;

1307 struct kvm_lapic_state *lapic = NULL;

1308

1309 switch (ioctl) {

1310 case KVM_GET_LAPIC: {

1311 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);

1312

1313 r = -ENOMEM;

1314 if (!lapic)

1315 goto out;

1316 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);

1317 if (r)

1318 goto out;

1319 r = -EFAULT;

1320 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))

1321 goto out;

1322 r = 0;

1323 break;

1324 }

1325 case KVM_SET_LAPIC: {

1326 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);

1327 r = -ENOMEM;

1328 if (!lapic)

1329 goto out;

1330 r = -EFAULT;

1331 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))

1332 goto out;

1333 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);

1334 if (r)

1335 goto out;

1336 r = 0;

1337 break;

1338 }

1339 case KVM_INTERRUPT: {

1340 struct kvm_interrupt irq;

1341

1342 r = -EFAULT;

1343 if (copy_from_user(&irq, argp, sizeof irq))

1344 goto out;

1345 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);

1346 if (r)

1347 goto out;

1348 r = 0;

1349 break;

1350 }

1351 case KVM_SET_CPUID: {

1352 struct kvm_cpuid __user *cpuid_arg = argp;

1353 struct kvm_cpuid cpuid;

1354

1355 r = -EFAULT;

1356 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))

1357 goto out;

1358 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);

1359 if (r)

1360 goto out;

1361 break;

1362 }

1363 case KVM_SET_CPUID2: {

1364 struct kvm_cpuid2 __user *cpuid_arg = argp;

1365 struct kvm_cpuid2 cpuid;

1366

1367 r = -EFAULT;

1368 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))

1369 goto out;

1370 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,

1371 cpuid_arg->entries);

1372 if (r)

1373 goto out;

1374 break;

1375 }

1376 case KVM_GET_CPUID2: {

1377 struct kvm_cpuid2 __user *cpuid_arg = argp;

1378 struct kvm_cpuid2 cpuid;

1379

1380 r = -EFAULT;

1381 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))

1382 goto out;

1383 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,

1384 cpuid_arg->entries);

1385 if (r)

1386 goto out;

1387 r = -EFAULT;

1388 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))

1389 goto out;

1390 r = 0;

1391 break;

1392 }

1393 case KVM_GET_MSRS:

1394 r = msr_io(vcpu, argp, kvm_get_msr, 1);

1395 break;

1396 case KVM_SET_MSRS:

1397 r = msr_io(vcpu, argp, do_set_msr, 0);

1398 break;

1399 case KVM_TPR_ACCESS_REPORTING: {

1400 struct kvm_tpr_access_ctl tac;

1401

1402 r = -EFAULT;

1403 if (copy_from_user(&tac, argp, sizeof tac))

1404 goto out;

1405 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);

1406 if (r)

1407 goto out;

1408 r = -EFAULT;

1409 if (copy_to_user(argp, &tac, sizeof tac))

1410 goto out;

1411 r = 0;

1412 break;

1413 };

1414 case KVM_SET_VAPIC_ADDR: {

1415 struct kvm_vapic_addr va;

1416

1417 r = -EINVAL;

1418 if (!irqchip_in_kernel(vcpu->kvm))

1419 goto out;

1420 r = -EFAULT;

1421 if (copy_from_user(&va, argp, sizeof va))

1422 goto out;

1423 r = 0;

1424 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);

1425 break;

1426 }

1427 default:

1428 r = -EINVAL;

1429 }

1430 out:

1431 if (lapic)

1432 kfree(lapic);

1433 return r;

1434 }

1435

1436 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)

1437 {

1438 int ret;

1439

1440 if (addr > (unsigned int)(-3 * PAGE_SIZE))

1441 return -1;

1442 ret = kvm_x86_ops->set_tss_addr(kvm, addr);

1443 return ret;

1444 }

1445

1446 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,

1447 u32 kvm_nr_mmu_pages)

1448 {

1449 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)

1450 return -EINVAL;

1451

1452 down_write(&kvm->slots_lock);

1453

1454 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);

1455 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;

1456

1457 up_write(&kvm->slots_lock);

1458 return 0;

1459 }

1460

1461 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)

1462 {

1463 return kvm->arch.n_alloc_mmu_pages;

1464 }

1465

1466 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)

1467 {

1468 int i;

1469 struct kvm_mem_alias *alias;

1470

1471 for (i = 0; i < kvm->arch.naliases; ++i) {

1472 alias = &kvm->arch.aliases[i];

1473 if (gfn >= alias->base_gfn

1474 && gfn < alias->base_gfn + alias->npages)

1475 return alias->target_gfn + gfn - alias->base_gfn;

1476 }

1477 return gfn;

1478 }

1479

1480 /*

1481 * Set a new alias region. Aliases map a portion of physical memory into

1482 * another portion. This is useful for memory windows, for example the PC

1483 * VGA region.

1484 */

1485 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,

1486 struct kvm_memory_alias *alias)

1487 {

1488 int r, n;

1489 struct kvm_mem_alias *p;

1490

1491 r = -EINVAL;

1492 /* General sanity checks */

1493 if (alias->memory_size & (PAGE_SIZE - 1))

1494 goto out;

1495 if (alias->guest_phys_addr & (PAGE_SIZE - 1))

1496 goto out;

1497 if (alias->slot >= KVM_ALIAS_SLOTS)

1498 goto out;

1499 if (alias->guest_phys_addr + alias->memory_size

1500 < alias->guest_phys_addr)

1501 goto out;

1502 if (alias->target_phys_addr + alias->memory_size

1503 < alias->target_phys_addr)

1504 goto out;

1505

1506 down_write(&kvm->slots_lock);

1507 spin_lock(&kvm->mmu_lock);

1508

1509 p = &kvm->arch.aliases[alias->slot];

1510 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;

1511 p->npages = alias->memory_size >> PAGE_SHIFT;

1512 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;

1513

1514 for (n = KVM_ALIAS_SLOTS; n > 0; --n)

1515 if (kvm->arch.aliases[n - 1].npages)

1516 break;

1517 kvm->arch.naliases = n;

1518

1519 spin_unlock(&kvm->mmu_lock);

1520 kvm_mmu_zap_all(kvm);

1521

1522 up_write(&kvm->slots_lock);

1523

1524 return 0;

1525

1526 out:

1527 return r;

1528 }

1529

1530 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)

1531 {

1532 int r;

1533

1534 r = 0;

1535 switch (chip->chip_id) {

1536 case KVM_IRQCHIP_PIC_MASTER:

1537 memcpy(&chip->chip.pic,

1538 &pic_irqchip(kvm)->pics[0],

1539 sizeof(struct kvm_pic_state));

1540 break;

1541 case KVM_IRQCHIP_PIC_SLAVE:

1542 memcpy(&chip->chip.pic,

1543 &pic_irqchip(kvm)->pics[1],

1544 sizeof(struct kvm_pic_state));

1545 break;

1546 case KVM_IRQCHIP_IOAPIC:

1547 memcpy(&chip->chip.ioapic,

1548 ioapic_irqchip(kvm),

1549 sizeof(struct kvm_ioapic_state));

1550 break;

1551 default:

1552 r = -EINVAL;

1553 break;

1554 }

1555 return r;

1556 }

1557

1558 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)

1559 {

1560 int r;

1561

1562 r = 0;

1563 switch (chip->chip_id) {

1564 case KVM_IRQCHIP_PIC_MASTER:

1565 memcpy(&pic_irqchip(kvm)->pics[0],

1566 &chip->chip.pic,

1567 sizeof(struct kvm_pic_state));

1568 break;

1569 case KVM_IRQCHIP_PIC_SLAVE:

1570 memcpy(&pic_irqchip(kvm)->pics[1],

1571 &chip->chip.pic,

1572 sizeof(struct kvm_pic_state));

1573 break;

1574 case KVM_IRQCHIP_IOAPIC:

1575 memcpy(ioapic_irqchip(kvm),

1576 &chip->chip.ioapic,

1577 sizeof(struct kvm_ioapic_state));

1578 break;

1579 default:

1580 r = -EINVAL;

1581 break;

1582 }

1583 kvm_pic_update_irq(pic_irqchip(kvm));

1584 return r;

1585 }

1586

1587 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)

1588 {

1589 int r = 0;

1590

1591 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));

1592 return r;

1593 }

1594

1595 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)

1596 {

1597 int r = 0;

1598

1599 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));

1600 kvm_pit_load_count(kvm, 0, ps->channels[0].count);

1601 return r;

1602 }

1603

1604 /*

1605 * Get (and clear) the dirty memory log for a memory slot.

1606 */

1607 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,

1608 struct kvm_dirty_log *log)

1609 {

1610 int r;

1611 int n;

1612 struct kvm_memory_slot *memslot;

1613 int is_dirty = 0;

1614

1615 down_write(&kvm->slots_lock);

1616

1617 r = kvm_get_dirty_log(kvm, log, &is_dirty);

1618 if (r)

1619 goto out;

1620

1621 /* If nothing is dirty, don't bother messing with page tables. */

1622 if (is_dirty) {

1623 kvm_mmu_slot_remove_write_access(kvm, log->slot);

1624 kvm_flush_remote_tlbs(kvm);

1625 memslot = &kvm->memslots[log->slot];

1626 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

1627 memset(memslot->dirty_bitmap, 0, n);

1628 }

1629 r = 0;

1630 out:

1631 up_write(&kvm->slots_lock);

1632 return r;

1633 }

1634

1635 long kvm_arch_vm_ioctl(struct file *filp,

1636 unsigned int ioctl, unsigned long arg)

1637 {

1638 struct kvm *kvm = filp->private_data;

1639 void __user *argp = (void __user *)arg;

1640 int r = -EINVAL;

1641 /*

1642 * This union makes it completely explicit to gcc-3.x

1643 * that these two variables' stack usage should be

1644 * combined, not added together.

1645 */

1646 union {

1647 struct kvm_pit_state ps;

1648 struct kvm_memory_alias alias;

1649 } u;

1650

1651 switch (ioctl) {

1652 case KVM_SET_TSS_ADDR:

1653 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);

1654 if (r < 0)

1655 goto out;

1656 break;

1657 case KVM_SET_MEMORY_REGION: {

1658 struct kvm_memory_region kvm_mem;

1659 struct kvm_userspace_memory_region kvm_userspace_mem;

1660

1661 r = -EFAULT;

1662 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))

1663 goto out;

1664 kvm_userspace_mem.slot = kvm_mem.slot;

1665 kvm_userspace_mem.flags = kvm_mem.flags;

1666 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;

1667 kvm_userspace_mem.memory_size = kvm_mem.memory_size;

1668 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);

1669 if (r)

1670 goto out;

1671 break;

1672 }

1673 case KVM_SET_NR_MMU_PAGES:

1674 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);

1675 if (r)

1676 goto out;

1677 break;

1678 case KVM_GET_NR_MMU_PAGES:

1679 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);

1680 break;

1681 case KVM_SET_MEMORY_ALIAS:

1682 r = -EFAULT;

1683 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))

1684 goto out;

1685 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);

1686 if (r)

1687 goto out;

1688 break;

1689 case KVM_CREATE_IRQCHIP:

1690 r = -ENOMEM;

1691 kvm->arch.vpic = kvm_create_pic(kvm);

1692 if (kvm->arch.vpic) {

1693 r = kvm_ioapic_init(kvm);

1694 if (r) {

1695 kfree(kvm->arch.vpic);

1696 kvm->arch.vpic = NULL;

1697 goto out;

1698 }

1699 } else

1700 goto out;

1701 break;

1702 case KVM_CREATE_PIT:

1703 r = -ENOMEM;

1704 kvm->arch.vpit = kvm_create_pit(kvm);

1705 if (kvm->arch.vpit)

1706 r = 0;

1707 break;

1708 case KVM_IRQ_LINE: {

1709 struct kvm_irq_level irq_event;

1710

1711 r = -EFAULT;

1712 if (copy_from_user(&irq_event, argp, sizeof irq_event))

1713 goto out;

1714 if (irqchip_in_kernel(kvm)) {

1715 mutex_lock(&kvm->lock);

1716 if (irq_event.irq < 16)

1717 kvm_pic_set_irq(pic_irqchip(kvm),

1718 irq_event.irq,

1719 irq_event.level);

1720 kvm_ioapic_set_irq(kvm->arch.vioapic,

1721 irq_event.irq,

1722 irq_event.level);

1723 mutex_unlock(&kvm->lock);

1724 r = 0;

1725 }

1726 break;

1727 }

1728 case KVM_GET_IRQCHIP: {

1729 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */

1730 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);

1731

1732 r = -ENOMEM;

1733 if (!chip)

1734 goto out;

1735 r = -EFAULT;

1736 if (copy_from_user(chip, argp, sizeof *chip))

1737 goto get_irqchip_out;

1738 r = -ENXIO;

1739 if (!irqchip_in_kernel(kvm))

1740 goto get_irqchip_out;

1741 r = kvm_vm_ioctl_get_irqchip(kvm, chip);

1742 if (r)

1743 goto get_irqchip_out;

1744 r = -EFAULT;

1745 if (copy_to_user(argp, chip, sizeof *chip))

1746 goto get_irqchip_out;

1747 r = 0;

1748 get_irqchip_out:

1749 kfree(chip);

1750 if (r)

1751 goto out;

1752 break;

1753 }

1754 case KVM_SET_IRQCHIP: {

1755 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */

1756 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);

1757

1758 r = -ENOMEM;

1759 if (!chip)

1760 goto out;

1761 r = -EFAULT;

1762 if (copy_from_user(chip, argp, sizeof *chip))

1763 goto set_irqchip_out;

1764 r = -ENXIO;

1765 if (!irqchip_in_kernel(kvm))

1766 goto set_irqchip_out;

1767 r = kvm_vm_ioctl_set_irqchip(kvm, chip);

1768 if (r)

1769 goto set_irqchip_out;

1770 r = 0;

1771 set_irqchip_out:

1772 kfree(chip);

1773 if (r)

1774 goto out;

1775 break;

1776 }

1777 case KVM_GET_PIT: {

1778 r = -EFAULT;

1779 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))

1780 goto out;

1781 r = -ENXIO;

1782 if (!kvm->arch.vpit)

1783 goto out;

1784 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);

1785 if (r)

1786 goto out;

1787 r = -EFAULT;

1788 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))

1789 goto out;

1790 r = 0;

1791 break;

1792 }

1793 case KVM_SET_PIT: {

1794 r = -EFAULT;

1795 if (copy_from_user(&u.ps, argp, sizeof u.ps))

1796 goto out;

1797 r = -ENXIO;

1798 if (!kvm->arch.vpit)

1799 goto out;

1800 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);

1801 if (r)

1802 goto out;

1803 r = 0;

1804 break;

1805 }

1806 default:

1807 ;

1808 }

1809 out:

1810 return r;

1811 }

1812

1813 static void kvm_init_msr_list(void)

1814 {

1815 u32 dummy[2];

1816 unsigned i, j;

1817

1818 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {

1819 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)

1820 continue;

1821 if (j < i)

1822 msrs_to_save[j] = msrs_to_save[i];

1823 j++;

1824 }

1825 num_msrs_to_save = j;

1826 }

1827

1828 /*

1829 * Only apic need an MMIO device hook, so shortcut now..

1830 */

1831 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,

1832 gpa_t addr, int len,

1833 int is_write)

1834 {

1835 struct kvm_io_device *dev;

1836

1837 if (vcpu->arch.apic) {

1838 dev = &vcpu->arch.apic->dev;

1839 if (dev->in_range(dev, addr, len, is_write))

1840 return dev;

1841 }

1842 return NULL;

1843 }

1844

1845

1846 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,

1847 gpa_t addr, int len,

1848 int is_write)

1849 {

1850 struct kvm_io_device *dev;

1851

1852 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);

1853 if (dev == NULL)

1854 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,

1855 is_write);

1856 return dev;

1857 }

1858

1859 int emulator_read_std(unsigned long addr,

1860 void *val,

1861 unsigned int bytes,

1862 struct kvm_vcpu *vcpu)

1863 {

1864 void *data = val;

1865 int r = X86EMUL_CONTINUE;

1866

1867 while (bytes) {

1868 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

1869 unsigned offset = addr & (PAGE_SIZE-1);

1870 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);

1871 int ret;

1872

1873 if (gpa == UNMAPPED_GVA) {

1874 r = X86EMUL_PROPAGATE_FAULT;

1875 goto out;

1876 }

1877 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);

1878 if (ret < 0) {

1879 r = X86EMUL_UNHANDLEABLE;

1880 goto out;

1881 }

1882

1883 bytes -= tocopy;

1884 data += tocopy;

1885 addr += tocopy;

1886 }

1887 out:

1888 return r;

1889 }

1890 EXPORT_SYMBOL_GPL(emulator_read_std);

1891

1892 static int emulator_read_emulated(unsigned long addr,

1893 void *val,

1894 unsigned int bytes,

1895 struct kvm_vcpu *vcpu)

1896 {

1897 struct kvm_io_device *mmio_dev;

1898 gpa_t gpa;

1899

1900 if (vcpu->mmio_read_completed) {

1901 memcpy(val, vcpu->mmio_data, bytes);

1902 vcpu->mmio_read_completed = 0;

1903 return X86EMUL_CONTINUE;

1904 }

1905

1906 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

1907

1908 /* For APIC access vmexit */

1909 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)

1910 goto mmio;

1911

1912 if (emulator_read_std(addr, val, bytes, vcpu)

1913 == X86EMUL_CONTINUE)

1914 return X86EMUL_CONTINUE;

1915 if (gpa == UNMAPPED_GVA)

1916 return X86EMUL_PROPAGATE_FAULT;

1917

1918 mmio:

1919 /*

1920 * Is this MMIO handled locally?

1921 */

1922 mutex_lock(&vcpu->kvm->lock);

1923 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);

1924 if (mmio_dev) {

1925 kvm_iodevice_read(mmio_dev, gpa, bytes, val);

1926 mutex_unlock(&vcpu->kvm->lock);

1927 return X86EMUL_CONTINUE;

1928 }

1929 mutex_unlock(&vcpu->kvm->lock);

1930

1931 vcpu->mmio_needed = 1;

1932 vcpu->mmio_phys_addr = gpa;

1933 vcpu->mmio_size = bytes;

1934 vcpu->mmio_is_write = 0;

1935

1936 return X86EMUL_UNHANDLEABLE;

1937 }

1938

1939 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,

1940 const void *val, int bytes)

1941 {

1942 int ret;

1943

1944 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);

1945 if (ret < 0)

1946 return 0;

1947 kvm_mmu_pte_write(vcpu, gpa, val, bytes);

1948 return 1;

1949 }

1950

1951 static int emulator_write_emulated_onepage(unsigned long addr,

1952 const void *val,

1953 unsigned int bytes,

1954 struct kvm_vcpu *vcpu)

1955 {

1956 struct kvm_io_device *mmio_dev;

1957 gpa_t gpa;

1958

1959 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

1960

1961 if (gpa == UNMAPPED_GVA) {

1962 kvm_inject_page_fault(vcpu, addr, 2);

1963 return X86EMUL_PROPAGATE_FAULT;

1964 }

1965

1966 /* For APIC access vmexit */

1967 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)

1968 goto mmio;

1969

1970 if (emulator_write_phys(vcpu, gpa, val, bytes))

1971 return X86EMUL_CONTINUE;

1972

1973 mmio:

1974 /*

1975 * Is this MMIO handled locally?

1976 */

1977 mutex_lock(&vcpu->kvm->lock);

1978 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);

1979 if (mmio_dev) {

1980 kvm_iodevice_write(mmio_dev, gpa, bytes, val);

1981 mutex_unlock(&vcpu->kvm->lock);

1982 return X86EMUL_CONTINUE;

1983 }

1984 mutex_unlock(&vcpu->kvm->lock);

1985

1986 vcpu->mmio_needed = 1;

1987 vcpu->mmio_phys_addr = gpa;

1988 vcpu->mmio_size = bytes;

1989 vcpu->mmio_is_write = 1;

1990 memcpy(vcpu->mmio_data, val, bytes);

1991

1992 return X86EMUL_CONTINUE;

1993 }

1994

1995 int emulator_write_emulated(unsigned long addr,

1996 const void *val,

1997 unsigned int bytes,

1998 struct kvm_vcpu *vcpu)

1999 {

2000 /* Crossing a page boundary? */

2001 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {

2002 int rc, now;

2003

2004 now = -addr & ~PAGE_MASK;

2005 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);

2006 if (rc != X86EMUL_CONTINUE)

2007 return rc;

2008 addr += now;

2009 val += now;

2010 bytes -= now;

2011 }

2012 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);

2013 }

2014 EXPORT_SYMBOL_GPL(emulator_write_emulated);

2015

2016 static int emulator_cmpxchg_emulated(unsigned long addr,

2017 const void *old,

2018 const void *new,

2019 unsigned int bytes,

2020 struct kvm_vcpu *vcpu)

2021 {

2022 static int reported;

2023

2024 if (!reported) {

2025 reported = 1;

2026 printk(KERN_WARNING "kvm: emulating exchange as write\n");

2027 }

2028 #ifndef CONFIG_X86_64

2029 /* guests cmpxchg8b have to be emulated atomically */

2030 if (bytes == 8) {

2031 gpa_t gpa;

2032 struct page *page;

2033 char *kaddr;

2034 u64 val;

2035

2036 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

2037

2038 if (gpa == UNMAPPED_GVA ||

2039 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)

2040 goto emul_write;

2041

2042 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))

2043 goto emul_write;

2044

2045 val = *(u64 *)new;

2046

2047 down_read(&current->mm->mmap_sem);

2048 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);

2049 up_read(&current->mm->mmap_sem);

2050

2051 kaddr = kmap_atomic(page, KM_USER0);

2052 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);

2053 kunmap_atomic(kaddr, KM_USER0);

2054 kvm_release_page_dirty(page);

2055 }

2056 emul_write:

2057 #endif

2058

2059 return emulator_write_emulated(addr, new, bytes, vcpu);

2060 }

2061

2062 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)

2063 {

2064 return kvm_x86_ops->get_segment_base(vcpu, seg);

2065 }

2066

2067 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)

2068 {

2069 return X86EMUL_CONTINUE;

2070 }

2071

2072 int emulate_clts(struct kvm_vcpu *vcpu)

2073 {

2074 KVMTRACE_0D(CLTS, vcpu, handler);

2075 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);

2076 return X86EMUL_CONTINUE;

2077 }

2078

2079 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)

2080 {

2081 struct kvm_vcpu *vcpu = ctxt->vcpu;

2082

2083 switch (dr) {

2084 case 0 ... 3:

2085 *dest = kvm_x86_ops->get_dr(vcpu, dr);

2086 return X86EMUL_CONTINUE;

2087 default:

2088 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);

2089 return X86EMUL_UNHANDLEABLE;

2090 }

2091 }

2092

2093 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)

2094 {

2095 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;

2096 int exception;

2097

2098 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);

2099 if (exception) {

2100 /* FIXME: better handling */

2101 return X86EMUL_UNHANDLEABLE;

2102 }

2103 return X86EMUL_CONTINUE;

2104 }

2105

2106 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)

2107 {

2108 u8 opcodes[4];

2109 unsigned long rip = vcpu->arch.rip;

2110 unsigned long rip_linear;

2111

2112 if (!printk_ratelimit())

2113 return;

2114

2115 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

2116

2117 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);

2118

2119 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",

2120 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);

2121 }

2122 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

2123

2124 static struct x86_emulate_ops emulate_ops = {

2125 .read_std = emulator_read_std,

2126 .read_emulated = emulator_read_emulated,

2127 .write_emulated = emulator_write_emulated,

2128 .cmpxchg_emulated = emulator_cmpxchg_emulated,

2129 };

2130

2131 int emulate_instruction(struct kvm_vcpu *vcpu,

2132 struct kvm_run *run,

2133 unsigned long cr2,

2134 u16 error_code,

2135 int emulation_type)

2136 {

2137 int r;

2138 struct decode_cache *c;

2139

2140 vcpu->arch.mmio_fault_cr2 = cr2;

2141 kvm_x86_ops->cache_regs(vcpu);

2142

2143 vcpu->mmio_is_write = 0;

2144 vcpu->arch.pio.string = 0;

2145

2146 if (!(emulation_type & EMULTYPE_NO_DECODE)) {

2147 int cs_db, cs_l;

2148 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

2149

2150 vcpu->arch.emulate_ctxt.vcpu = vcpu;

2151 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);

2152 vcpu->arch.emulate_ctxt.mode =

2153 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)

2154 ? X86EMUL_MODE_REAL : cs_l

2155 ? X86EMUL_MODE_PROT64 : cs_db

2156 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

2157

2158 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

2159

2160 /* Reject the instructions other than VMCALL/VMMCALL when

2161 * try to emulate invalid opcode */

2162 c = &vcpu->arch.emulate_ctxt.decode;

2163 if ((emulation_type & EMULTYPE_TRAP_UD) &&

2164 (!(c->twobyte && c->b == 0x01 &&

2165 (c->modrm_reg == 0 || c->modrm_reg == 3) &&

2166 c->modrm_mod == 3 && c->modrm_rm == 1)))

2167 return EMULATE_FAIL;

2168

2169 ++vcpu->stat.insn_emulation;

2170 if (r) {

2171 ++vcpu->stat.insn_emulation_fail;

2172 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))

2173 return EMULATE_DONE;

2174 return EMULATE_FAIL;

2175 }

2176 }

2177

2178 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

2179

2180 if (vcpu->arch.pio.string)

2181 return EMULATE_DO_MMIO;

2182

2183 if ((r || vcpu->mmio_is_write) && run) {

2184 run->exit_reason = KVM_EXIT_MMIO;

2185 run->mmio.phys_addr = vcpu->mmio_phys_addr;

2186 memcpy(run->mmio.data, vcpu->mmio_data, 8);

2187 run->mmio.len = vcpu->mmio_size;

2188 run->mmio.is_write = vcpu->mmio_is_write;

2189 }

2190

2191 if (r) {

2192 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))

2193 return EMULATE_DONE;

2194 if (!vcpu->mmio_needed) {

2195 kvm_report_emulation_failure(vcpu, "mmio");

2196 return EMULATE_FAIL;

2197 }

2198 return EMULATE_DO_MMIO;

2199 }

2200

2201 kvm_x86_ops->decache_regs(vcpu);

2202 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

2203

2204 if (vcpu->mmio_is_write) {

2205 vcpu->mmio_needed = 0;

2206 return EMULATE_DO_MMIO;

2207 }

2208

2209 return EMULATE_DONE;

2210 }

2211 EXPORT_SYMBOL_GPL(emulate_instruction);

2212

2213 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)

2214 {

2215 int i;

2216

2217 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)

2218 if (vcpu->arch.pio.guest_pages[i]) {

2219 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);

2220 vcpu->arch.pio.guest_pages[i] = NULL;

2221 }

2222 }

2223

2224 static int pio_copy_data(struct kvm_vcpu *vcpu)

2225 {

2226 void *p = vcpu->arch.pio_data;

2227 void *q;

2228 unsigned bytes;

2229 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;

2230

2231 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,

2232 PAGE_KERNEL);

2233 if (!q) {

2234 free_pio_guest_pages(vcpu);

2235 return -ENOMEM;

2236 }

2237 q += vcpu->arch.pio.guest_page_offset;

2238 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;

2239 if (vcpu->arch.pio.in)

2240 memcpy(q, p, bytes);

2241 else

2242 memcpy(p, q, bytes);

2243 q -= vcpu->arch.pio.guest_page_offset;

2244 vunmap(q);

2245 free_pio_guest_pages(vcpu);

2246 return 0;

2247 }

2248

2249 int complete_pio(struct kvm_vcpu *vcpu)

2250 {

2251 struct kvm_pio_request *io = &vcpu->arch.pio;

2252 long delta;

2253 int r;

2254

2255 kvm_x86_ops->cache_regs(vcpu);

2256

2257 if (!io->string) {

2258 if (io->in)

2259 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,

2260 io->size);

2261 } else {

2262 if (io->in) {

2263 r = pio_copy_data(vcpu);

2264 if (r) {

2265 kvm_x86_ops->cache_regs(vcpu);

2266 return r;

2267 }

2268 }

2269

2270 delta = 1;

2271 if (io->rep) {

2272 delta *= io->cur_count;

2273 /*

2274 * The size of the register should really depend on

2275 * current address size.

2276 */

2277 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;

2278 }

2279 if (io->down)

2280 delta = -delta;

2281 delta *= io->size;

2282 if (io->in)

2283 vcpu->arch.regs[VCPU_REGS_RDI] += delta;

2284 else

2285 vcpu->arch.regs[VCPU_REGS_RSI] += delta;

2286 }

2287

2288 kvm_x86_ops->decache_regs(vcpu);

2289

2290 io->count -= io->cur_count;

2291 io->cur_count = 0;

2292

2293 return 0;

2294 }

2295

2296 static void kernel_pio(struct kvm_io_device *pio_dev,

2297 struct kvm_vcpu *vcpu,

2298 void *pd)

2299 {

2300 /* TODO: String I/O for in kernel device */

2301

2302 mutex_lock(&vcpu->kvm->lock);

2303 if (vcpu->arch.pio.in)

2304 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,

2305 vcpu->arch.pio.size,

2306 pd);

2307 else

2308 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,

2309 vcpu->arch.pio.size,

2310 pd);

2311 mutex_unlock(&vcpu->kvm->lock);

2312 }

2313

2314 static void pio_string_write(struct kvm_io_device *pio_dev,

2315 struct kvm_vcpu *vcpu)

2316 {

2317 struct kvm_pio_request *io = &vcpu->arch.pio;

2318 void *pd = vcpu->arch.pio_data;

2319 int i;

2320

2321 mutex_lock(&vcpu->kvm->lock);

2322 for (i = 0; i < io->cur_count; i++) {

2323 kvm_iodevice_write(pio_dev, io->port,

2324 io->size,

2325 pd);

2326 pd += io->size;

2327 }

2328 mutex_unlock(&vcpu->kvm->lock);

2329 }

2330

2331 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,

2332 gpa_t addr, int len,

2333 int is_write)

2334 {

2335 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);

2336 }

2337

2338 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,

2339 int size, unsigned port)

2340 {

2341 struct kvm_io_device *pio_dev;

2342

2343 vcpu->run->exit_reason = KVM_EXIT_IO;

2344 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;

2345 vcpu->run->io.size = vcpu->arch.pio.size = size;

2346 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;

2347 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;

2348 vcpu->run->io.port = vcpu->arch.pio.port = port;

2349 vcpu->arch.pio.in = in;

2350 vcpu->arch.pio.string = 0;

2351 vcpu->arch.pio.down = 0;

2352 vcpu->arch.pio.guest_page_offset = 0;

2353 vcpu->arch.pio.rep = 0;

2354

2355 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)

2356 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,

2357 handler);

2358 else

2359 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,

2360 handler);

2361

2362 kvm_x86_ops->cache_regs(vcpu);

2363 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);

2364

2365 kvm_x86_ops->skip_emulated_instruction(vcpu);

2366

2367 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);

2368 if (pio_dev) {

2369 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);

2370 complete_pio(vcpu);

2371 return 1;

2372 }

2373 return 0;

2374 }

2375 EXPORT_SYMBOL_GPL(kvm_emulate_pio);

2376

2377 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,

2378 int size, unsigned long count, int down,

2379 gva_t address, int rep, unsigned port)

2380 {

2381 unsigned now, in_page;

2382 int i, ret = 0;

2383 int nr_pages = 1;

2384 struct page *page;

2385 struct kvm_io_device *pio_dev;

2386

2387 vcpu->run->exit_reason = KVM_EXIT_IO;

2388 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;

2389 vcpu->run->io.size = vcpu->arch.pio.size = size;

2390 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;

2391 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;

2392 vcpu->run->io.port = vcpu->arch.pio.port = port;

2393 vcpu->arch.pio.in = in;

2394 vcpu->arch.pio.string = 1;

2395 vcpu->arch.pio.down = down;

2396 vcpu->arch.pio.guest_page_offset = offset_in_page(address);

2397 vcpu->arch.pio.rep = rep;

2398

2399 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)

2400 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,

2401 handler);

2402 else

2403 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,

2404 handler);

2405

2406 if (!count) {

2407 kvm_x86_ops->skip_emulated_instruction(vcpu);

2408 return 1;

2409 }

2410

2411 if (!down)

2412 in_page = PAGE_SIZE - offset_in_page(address);

2413 else

2414 in_page = offset_in_page(address) + size;

2415 now = min(count, (unsigned long)in_page / size);

2416 if (!now) {

2417 /*

2418 * String I/O straddles page boundary. Pin two guest pages

2419 * so that we satisfy atomicity constraints. Do just one

2420 * transaction to avoid complexity.

2421 */

2422 nr_pages = 2;

2423 now = 1;

2424 }

2425 if (down) {

2426 /*

2427 * String I/O in reverse. Yuck. Kill the guest, fix later.

2428 */

2429 pr_unimpl(vcpu, "guest string pio down\n");

2430 kvm_inject_gp(vcpu, 0);

2431 return 1;

2432 }

2433 vcpu->run->io.count = now;

2434 vcpu->arch.pio.cur_count = now;

2435

2436 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)

2437 kvm_x86_ops->skip_emulated_instruction(vcpu);

2438

2439 for (i = 0; i < nr_pages; ++i) {

2440 page = gva_to_page(vcpu, address + i * PAGE_SIZE);

2441 vcpu->arch.pio.guest_pages[i] = page;

2442 if (!page) {

2443 kvm_inject_gp(vcpu, 0);

2444 free_pio_guest_pages(vcpu);

2445 return 1;

2446 }

2447 }

2448

2449 pio_dev = vcpu_find_pio_dev(vcpu, port,

2450 vcpu->arch.pio.cur_count,

2451 !vcpu->arch.pio.in);

2452 if (!vcpu->arch.pio.in) {

2453 /* string PIO write */

2454 ret = pio_copy_data(vcpu);

2455 if (ret >= 0 && pio_dev) {

2456 pio_string_write(pio_dev, vcpu);

2457 complete_pio(vcpu);

2458 if (vcpu->arch.pio.count == 0)

2459 ret = 1;

2460 }

2461 } else if (pio_dev)

2462 pr_unimpl(vcpu, "no string pio read support yet, "

2463 "port %x size %d count %ld\n",

2464 port, size, count);

2465

2466 return ret;

2467 }

2468 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);

2469

2470 int kvm_arch_init(void *opaque)

2471 {

2472 int r;

2473 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;

2474

2475 if (kvm_x86_ops) {

2476 printk(KERN_ERR "kvm: already loaded the other module\n");

2477 r = -EEXIST;

2478 goto out;

2479 }

2480

2481 if (!ops->cpu_has_kvm_support()) {

2482 printk(KERN_ERR "kvm: no hardware support\n");

2483 r = -EOPNOTSUPP;

2484 goto out;

2485 }

2486 if (ops->disabled_by_bios()) {

2487 printk(KERN_ERR "kvm: disabled by bios\n");

2488 r = -EOPNOTSUPP;

2489 goto out;

2490 }

2491

2492 r = kvm_mmu_module_init();

2493 if (r)

2494 goto out;

2495

2496 kvm_init_msr_list();

2497

2498 kvm_x86_ops = ops;

2499 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);

2500 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);

2501 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,

2502 PT_DIRTY_MASK, PT64_NX_MASK, 0);

2503 return 0;

2504

2505 out:

2506 return r;

2507 }

2508

2509 void kvm_arch_exit(void)

2510 {

2511 kvm_x86_ops = NULL;

2512 kvm_mmu_module_exit();

2513 }

2514

2515 int kvm_emulate_halt(struct kvm_vcpu *vcpu)

2516 {

2517 ++vcpu->stat.halt_exits;

2518 KVMTRACE_0D(HLT, vcpu, handler);

2519 if (irqchip_in_kernel(vcpu->kvm)) {

2520 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;

2521 up_read(&vcpu->kvm->slots_lock);

2522 kvm_vcpu_block(vcpu);

2523 down_read(&vcpu->kvm->slots_lock);

2524 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)

2525 return -EINTR;

2526 return 1;

2527 } else {

2528 vcpu->run->exit_reason = KVM_EXIT_HLT;

2529 return 0;

2530 }

2531 }

2532 EXPORT_SYMBOL_GPL(kvm_emulate_halt);

2533

2534 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,

2535 unsigned long a1)

2536 {

2537 if (is_long_mode(vcpu))

2538 return a0;

2539 else

2540 return a0 | ((gpa_t)a1 << 32);

2541 }

2542

2543 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)

2544 {

2545 unsigned long nr, a0, a1, a2, a3, ret;

2546 int r = 1;

2547

2548 kvm_x86_ops->cache_regs(vcpu);

2549

2550 nr = vcpu->arch.regs[VCPU_REGS_RAX];

2551 a0 = vcpu->arch.regs[VCPU_REGS_RBX];

2552 a1 = vcpu->arch.regs[VCPU_REGS_RCX];

2553 a2 = vcpu->arch.regs[VCPU_REGS_RDX];

2554 a3 = vcpu->arch.regs[VCPU_REGS_RSI];

2555

2556 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);

2557

2558 if (!is_long_mode(vcpu)) {

2559 nr &= 0xFFFFFFFF;

2560 a0 &= 0xFFFFFFFF;

2561 a1 &= 0xFFFFFFFF;

2562 a2 &= 0xFFFFFFFF;

2563 a3 &= 0xFFFFFFFF;

2564 }

2565

2566 switch (nr) {

2567 case KVM_HC_VAPIC_POLL_IRQ:

2568 ret = 0;

2569 break;

2570 case KVM_HC_MMU_OP:

2571 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);

2572 break;

2573 default:

2574 ret = -KVM_ENOSYS;

2575 break;

2576 }

2577 vcpu->arch.regs[VCPU_REGS_RAX] = ret;

2578 kvm_x86_ops->decache_regs(vcpu);

2579 ++vcpu->stat.hypercalls;

2580 return r;

2581 }

2582 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);

2583

2584 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)

2585 {

2586 char instruction[3];

2587 int ret = 0;

2588

2589

2590 /*

2591 * Blow out the MMU to ensure that no other VCPU has an active mapping

2592 * to ensure that the updated hypercall appears atomically across all

2593 * VCPUs.

2594 */

2595 kvm_mmu_zap_all(vcpu->kvm);

2596

2597 kvm_x86_ops->cache_regs(vcpu);

2598 kvm_x86_ops->patch_hypercall(vcpu, instruction);

2599 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)

2600 != X86EMUL_CONTINUE)

2601 ret = -EFAULT;

2602

2603 return ret;

2604 }

2605

/*
 * Combine the upper 32 bits of @curr_cr with @new_val as the lower 32 bits.
 */
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
	const u64 low_mask = 0xffffffffULL;
	u64 high_half = curr_cr & ~low_mask;

	return high_half | new_val;
}

2610

2611 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)

2612 {

2613 struct descriptor_table dt = { limit, base };

2614

2615 kvm_x86_ops->set_gdt(vcpu, &dt);

2616 }

2617

2618 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)

2619 {

2620 struct descriptor_table dt = { limit, base };

2621

2622 kvm_x86_ops->set_idt(vcpu, &dt);

2623 }

2624

2625 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,

2626 unsigned long *rflags)

2627 {

2628 kvm_lmsw(vcpu, msw);

2629 *rflags = kvm_x86_ops->get_rflags(vcpu);

2630 }

2631

2632 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)

2633 {

2634 unsigned long value;

2635

2636 kvm_x86_ops->decache_cr4_guest_bits(vcpu);

2637 switch (cr) {

2638 case 0:

2639 value = vcpu->arch.cr0;

2640 break;

2641 case 2:

2642 value = vcpu->arch.cr2;

2643 break;

2644 case 3:

2645 value = vcpu->arch.cr3;

2646 break;

2647 case 4:

2648 value = vcpu->arch.cr4;

2649 break;

2650 case 8:

2651 value = kvm_get_cr8(vcpu);

2652 break;

2653 default:

2654 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);

2655 return 0;

2656 }

2657 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,

2658 (u32)((u64)value >> 32), handler);

2659

2660 return value;

2661 }

2662

2663 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,

2664 unsigned long *rflags)

2665 {

2666 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,

2667 (u32)((u64)val >> 32), handler);

2668

2669 switch (cr) {

2670 case 0:

2671 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));

2672 *rflags = kvm_x86_ops->get_rflags(vcpu);

2673 break;

2674 case 2:

2675 vcpu->arch.cr2 = val;

2676 break;

2677 case 3:

2678 kvm_set_cr3(vcpu, val);

2679 break;

2680 case 4:

2681 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));

2682 break;

2683 case 8:

2684 kvm_set_cr8(vcpu, val & 0xfUL);

2685 break;

2686 default:

2687 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);

2688 }

2689 }

2690

2691 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)

2692 {

2693 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];

2694 int j, nent = vcpu->arch.cpuid_nent;

2695

2696 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;

2697 /* when no next entry is found, the current entry[i] is reselected */

2698 for (j = i + 1; j == i; j = (j + 1) % nent) {

2699 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];

2700 if (ej->function == e->function) {

2701 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;

2702 return j;

2703 }

2704 }

2705 return 0; /* silence gcc, even though control never reaches here */

2706 }

2707

2708 /* find an entry with matching function, matching index (if needed), and that

2709 * should be read next (if it's stateful) */

2710 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,

2711 u32 function, u32 index)

2712 {

2713 if (e->function != function)

2714 return 0;

2715 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)

2716 return 0;

2717 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&

2718 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))

2719 return 0;

2720 return 1;

2721 }

2722

2723 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)

2724 {

2725 int i;

2726 u32 function, index;

2727 struct kvm_cpuid_entry2 *e, *best;

2728

2729 kvm_x86_ops->cache_regs(vcpu);

2730 function = vcpu->arch.regs[VCPU_REGS_RAX];

2731 index = vcpu->arch.regs[VCPU_REGS_RCX];

2732 vcpu->arch.regs[VCPU_REGS_RAX] = 0;

2733 vcpu->arch.regs[VCPU_REGS_RBX] = 0;

2734 vcpu->arch.regs[VCPU_REGS_RCX] = 0;

2735 vcpu->arch.regs[VCPU_REGS_RDX] = 0;

2736 best = NULL;

2737 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {

2738 e = &vcpu->arch.cpuid_entries[i];

2739 if (is_matching_cpuid_entry(e, function, index)) {

2740 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)

2741 move_to_next_stateful_cpuid_entry(vcpu, i);

2742 best = e;

2743 break;

2744 }

2745 /*

2746 * Both basic or both extended?

2747 */

2748 if (((e->function ^ function) & 0x80000000) == 0)

2749 if (!best || e->function > best->function)

2750 best = e;

2751 }

2752 if (best) {

2753 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;

2754 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;

2755 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;

2756 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;

2757 }

2758 kvm_x86_ops->decache_regs(vcpu);

2759 kvm_x86_ops->skip_emulated_instruction(vcpu);

2760 KVMTRACE_5D(CPUID, vcpu, function,

2761 (u32)vcpu->arch.regs[VCPU_REGS_RAX],

2762 (u32)vcpu->arch.regs[VCPU_REGS_RBX],

2763 (u32)vcpu->arch.regs[VCPU_REGS_RCX],

2764 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);

2765 }

2766 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);

2767

2768 /*

2769 * Check if userspace requested an interrupt window, and that the

2770 * interrupt window is open.

2771 *

2772 * No need to exit to userspace if we already have an interrupt queued.

2773 */

2774 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,

2775 struct kvm_run *kvm_run)

2776 {

2777 return (!vcpu->arch.irq_summary &&

2778 kvm_run->request_interrupt_window &&

2779 vcpu->arch.interrupt_window_open &&

2780 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));

2781 }

2782

2783 static void post_kvm_run_save(struct kvm_vcpu *vcpu,

2784 struct kvm_run *kvm_run)

2785 {

2786 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;

2787 kvm_run->cr8 = kvm_get_cr8(vcpu);

2788 kvm_run->apic_base = kvm_get_apic_base(vcpu);

2789 if (irqchip_in_kernel(vcpu->kvm))

2790 kvm_run->ready_for_interrupt_injection = 1;

2791 else

2792 kvm_run->ready_for_interrupt_injection =

2793 (vcpu->arch.interrupt_window_open &&

2794 vcpu->arch.irq_summary == 0);

2795 }

2796

2797 static void vapic_enter(struct kvm_vcpu *vcpu)

2798 {

2799 struct kvm_lapic *apic = vcpu->arch.apic;

2800 struct page *page;

2801

2802 if (!apic || !apic->vapic_addr)

2803 return;

2804

2805 down_read(&current->mm->mmap_sem);

2806 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);

2807 up_read(&current->mm->mmap_sem);

2808

2809 vcpu->arch.apic->vapic_page = page;

2810 }

2811

2812 static void vapic_exit(struct kvm_vcpu *vcpu)

2813 {

2814 struct kvm_lapic *apic = vcpu->arch.apic;

2815

2816 if (!apic || !apic->vapic_addr)

2817 return;

2818

2819 down_read(&vcpu->kvm->slots_lock);

2820 kvm_release_page_dirty(apic->vapic_page);

2821 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);

2822 up_read(&vcpu->kvm->slots_lock);

2823 }

2824

2825 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)

2826 {

2827 int r;

2828

2829 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {

2830 pr_debug("vcpu %d received sipi with vector # %x\n",

2831 vcpu->vcpu_id, vcpu->arch.sipi_vector);

2832 kvm_lapic_reset(vcpu);

2833 r = kvm_x86_ops->vcpu_reset(vcpu);

2834 if (r)

2835 return r;

2836 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

2837 }

2838

2839 down_read(&vcpu->kvm->slots_lock);

2840 vapic_enter(vcpu);

2841

2842 again:

2843 if (vcpu->requests)

2844 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))

2845 kvm_mmu_unload(vcpu);

2846

2847 r = kvm_mmu_reload(vcpu);

2848 if (unlikely(r))

2849 goto out;

2850

2851 if (vcpu->requests) {

2852 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))

2853 __kvm_migrate_timers(vcpu);

2854 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))

2855 kvm_x86_ops->tlb_flush(vcpu);

2856 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,

2857 &vcpu->requests)) {

2858 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;

2859 r = 0;

2860 goto out;

2861 }

2862 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {

2863 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;

2864 r = 0;

2865 goto out;

2866 }

2867 }

2868

2869 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);

2870 kvm_inject_pending_timer_irqs(vcpu);

2871

2872 preempt_disable();

2873

2874 kvm_x86_ops->prepare_guest_switch(vcpu);

2875 kvm_load_guest_fpu(vcpu);

2876

2877 local_irq_disable();

2878

2879 if (vcpu->requests || need_resched()) {

2880 local_irq_enable();

2881 preempt_enable();

2882 r = 1;

2883 goto out;

2884 }

2885

2886 if (signal_pending(current)) {

2887 local_irq_enable();

2888 preempt_enable();

2889 r = -EINTR;

2890 kvm_run->exit_reason = KVM_EXIT_INTR;

2891 ++vcpu->stat.signal_exits;

2892 goto out;

2893 }

2894

2895 if (vcpu->guest_debug.enabled)

2896 kvm_x86_ops->guest_debug_pre(vcpu);

2897

2898 vcpu->guest_mode = 1;

2899 /*

2900 * Make sure that guest_mode assignment won't happen after

2901 * testing the pending IRQ vector bitmap.

2902 */

2903 smp_wmb();

2904

2905 if (vcpu->arch.exception.pending)

2906 __queue_exception(vcpu);

2907 else if (irqchip_in_kernel(vcpu->kvm))

2908 kvm_x86_ops->inject_pending_irq(vcpu);

2909 else

2910 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);

2911

2912 kvm_lapic_sync_to_vapic(vcpu);

2913

2914 up_read(&vcpu->kvm->slots_lock);

2915

2916 kvm_guest_enter();

2917

2918

2919 KVMTRACE_0D(VMENTRY, vcpu, entryexit);

2920 kvm_x86_ops->run(vcpu, kvm_run);

2921

2922 vcpu->guest_mode = 0;

2923 local_irq_enable();

2924

2925 ++vcpu->stat.exits;

2926

2927 /*

2928 * We must have an instruction between local_irq_enable() and

2929 * kvm_guest_exit(), so the timer interrupt isn't delayed by

2930 * the interrupt shadow. The stat.exits increment will do nicely.

2931 * But we need to prevent reordering, hence this barrier():

2932 */

2933 barrier();

2934

2935 kvm_guest_exit();

2936

2937 preempt_enable();

2938

2939 down_read(&vcpu->kvm->slots_lock);

2940

2941 /*

2942 * Profile KVM exit RIPs:

2943 */

2944 if (unlikely(prof_on == KVM_PROFILING)) {

2945 kvm_x86_ops->cache_regs(vcpu);

2946 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);

2947 }

2948

2949 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))

2950 vcpu->arch.exception.pending = false;

2951

2952 kvm_lapic_sync_from_vapic(vcpu);

2953

2954 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);

2955

2956 if (r > 0) {

2957 if (dm_request_for_irq_injection(vcpu, kvm_run)) {

2958 r = -EINTR;

2959 kvm_run->exit_reason = KVM_EXIT_INTR;

2960 ++vcpu->stat.request_irq_exits;

2961 goto out;

2962 }

2963 if (!need_resched())

2964 goto again;

2965 }

2966

2967 out:

2968 up_read(&vcpu->kvm->slots_lock);

2969 if (r > 0) {

2970 kvm_resched(vcpu);

2971 down_read(&vcpu->kvm->slots_lock);

2972 goto again;

2973 }

2974

2975 post_kvm_run_save(vcpu, kvm_run);

2976

2977 vapic_exit(vcpu);

2978

2979 return r;

2980 }

2981

2982 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)

2983 {

2984 int r;

2985 sigset_t sigsaved;

2986

2987 vcpu_load(vcpu);

2988

2989 if (vcpu->sigset_active)

2990 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

2991

2992 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {

2993 kvm_vcpu_block(vcpu);

2994 r = -EAGAIN;

2995 goto out;

2996 }

2997

2998 /* re-sync apic's tpr */

2999 if (!irqchip_in_kernel(vcpu->kvm))

3000 kvm_set_cr8(vcpu, kvm_run->cr8);

3001

3002 if (vcpu->arch.pio.cur_count) {

3003 r = complete_pio(vcpu);

3004 if (r)

3005 goto out;

3006 }

3007 #if CONFIG_HAS_IOMEM

3008 if (vcpu->mmio_needed) {

3009 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);

3010 vcpu->mmio_read_completed = 1;

3011 vcpu->mmio_needed = 0;

3012

3013 down_read(&vcpu->kvm->slots_lock);

3014 r = emulate_instruction(vcpu, kvm_run,

3015 vcpu->arch.mmio_fault_cr2, 0,

3016 EMULTYPE_NO_DECODE);

3017 up_read(&vcpu->kvm->slots_lock);

3018 if (r == EMULATE_DO_MMIO) {

3019 /*

3020 * Read-modify-write. Back to userspace.

3021 */

3022 r = 0;

3023 goto out;

3024 }

3025 }

3026 #endif

3027 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {

3028 kvm_x86_ops->cache_regs(vcpu);

3029 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;

3030 kvm_x86_ops->decache_regs(vcpu);

3031 }

3032

3033 r = __vcpu_run(vcpu, kvm_run);

3034

3035 out:

3036 if (vcpu->sigset_active)

3037 sigprocmask(SIG_SETMASK, &sigsaved, NULL);

3038

3039 vcpu_put(vcpu);

3040 return r;

3041 }

3042

3043 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)

3044 {

3045 vcpu_load(vcpu);

3046

3047 kvm_x86_ops->cache_regs(vcpu);

3048

3049 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];

3050 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];

3051 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];

3052 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];

3053 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];

3054 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];

3055 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];

3056 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];

3057 #ifdef CONFIG_X86_64

3058 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];

3059 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];

3060 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];

3061 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];

3062 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];

3063 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];

3064 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];

3065 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];

3066 #endif

3067

3068 regs->rip = vcpu->arch.rip;

3069 regs->rflags = kvm_x86_ops->get_rflags(vcpu);

3070

3071 /*

3072 * Don't leak debug flags in case they were set for guest debugging

3073 */

3074 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)

3075 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);

3076

3077 vcpu_put(vcpu);

3078

3079 return 0;

3080 }

3081

3082 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)

3083 {

3084 vcpu_load(vcpu);

3085

3086 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;

3087 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;

3088 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;

3089 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;

3090 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;

3091 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;

3092 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;

3093 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;

3094 #ifdef CONFIG_X86_64

3095 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;

3096 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;

3097 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;

3098 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;

3099 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;

3100 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;

3101 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;

3102 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;

3103 #endif

3104

3105 vcpu->arch.rip = regs->rip;

3106 kvm_x86_ops->set_rflags(vcpu, regs->rflags);

3107

3108 kvm_x86_ops->decache_regs(vcpu);

3109

3110 vcpu->arch.exception.pending = false;

3111

3112 vcpu_put(vcpu);

3113

3114 return 0;

3115 }

3116

3117 void kvm_get_segment(struct kvm_vcpu *vcpu,

3118 struct kvm_segment *var, int seg)

3119 {

3120 kvm_x86_ops->get_segment(vcpu, var, seg);

3121 }

3122

3123 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)

3124 {

3125 struct kvm_segment cs;

3126

3127 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);

3128 *db = cs.db;

3129 *l = cs.l;

3130 }

3131 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);

3132

3133 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,

3134 struct kvm_sregs *sregs)

3135 {

3136 struct descriptor_table dt;

3137 int pending_vec;

3138

3139 vcpu_load(vcpu);

3140

3141 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);

3142 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);

3143 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);

3144 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);

3145 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);

3146 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

3147

3148 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);

3149 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

3150

3151 kvm_x86_ops->get_idt(vcpu, &dt);

3152 sregs->idt.limit = dt.limit;

3153 sregs->idt.base = dt.base;

3154 kvm_x86_ops->get_gdt(vcpu, &dt);

3155 sregs->gdt.limit = dt.limit;

3156 sregs->gdt.base = dt.base;

3157

3158 kvm_x86_ops->decache_cr4_guest_bits(vcpu);

3159 sregs->cr0 = vcpu->arch.cr0;

3160 sregs->cr2 = vcpu->arch.cr2;

3161 sregs->cr3 = vcpu->arch.cr3;

3162 sregs->cr4 = vcpu->arch.cr4;

3163 sregs->cr8 = kvm_get_cr8(vcpu);

3164 sregs->efer = vcpu->arch.shadow_efer;

3165 sregs->apic_base = kvm_get_apic_base(vcpu);

3166

3167 if (irqchip_in_kernel(vcpu->kvm)) {

3168 memset(sregs->interrupt_bitmap, 0,

3169 sizeof sregs->interrupt_bitmap);

3170 pending_vec = kvm_x86_ops->get_irq(vcpu);

3171 if (pending_vec >= 0)

3172 set_bit(pending_vec,

3173 (unsigned long *)sregs->interrupt_bitmap);

3174 } else

3175 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,

3176 sizeof sregs->interrupt_bitmap);

3177

3178 vcpu_put(vcpu);

3179

3180 return 0;

3181 }

3182

3183 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,

3184 struct kvm_mp_state *mp_state)

3185 {

3186 vcpu_load(vcpu);

3187 mp_state->mp_state = vcpu->arch.mp_state;

3188 vcpu_put(vcpu);

3189 return 0;

3190 }

3191

3192 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,

3193 struct kvm_mp_state *mp_state)

3194 {

3195 vcpu_load(vcpu);

3196 vcpu->arch.mp_state = mp_state->mp_state;

3197 vcpu_put(vcpu);

3198 return 0;

3199 }

3200

3201 static void kvm_set_segment(struct kvm_vcpu *vcpu,

3202 struct kvm_segment *var, int seg)

3203 {

3204 kvm_x86_ops->set_segment(vcpu, var, seg);

3205 }

3206

3207 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,

3208 struct kvm_segment *kvm_desct)

3209 {

3210 kvm_desct->base = seg_desc->base0;

3211 kvm_desct->base |= seg_desc->base1 << 16;

3212 kvm_desct->base |= seg_desc->base2 << 24;

3213 kvm_desct->limit = seg_desc->limit0;

3214 kvm_desct->limit |= seg_desc->limit << 16;

3215 if (seg_desc->g) {

3216 kvm_desct->limit <<= 12;

3217 kvm_desct->limit |= 0xfff;

3218 }

3219 kvm_desct->selector = selector;

3220 kvm_desct->type = seg_desc->type;

3221 kvm_desct->present = seg_desc->p;

3222 kvm_desct->dpl = seg_desc->dpl;

3223 kvm_desct->db = seg_desc->d;

3224 kvm_desct->s = seg_desc->s;

3225 kvm_desct->l = seg_desc->l;

3226 kvm_desct->g = seg_desc->g;

3227 kvm_desct->avl = seg_desc->avl;

3228 if (!selector)

3229 kvm_desct->unusable = 1;

3230 else

3231 kvm_desct->unusable = 0;

3232 kvm_desct->padding = 0;

3233 }

3234

3235 static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,

3236 u16 selector,

3237 struct descriptor_table *dtable)

3238 {

3239 if (selector & 1 << 2) {

3240 struct kvm_segment kvm_seg;

3241

3242 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);

3243

3244 if (kvm_seg.unusable)

3245 dtable->limit = 0;

3246 else

3247 dtable->limit = kvm_seg.limit;

3248 dtable->base = kvm_seg.base;

3249 }

3250 else

3251 kvm_x86_ops->get_gdt(vcpu, dtable);

3252 }

3253

3254 /* allowed just for 8 bytes segments */

3255 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,

3256 struct desc_struct *seg_desc)

3257 {

3258 gpa_t gpa;

3259 struct descriptor_table dtable;

3260 u16 index = selector >> 3;

3261

3262 get_segment_descritptor_dtable(vcpu, selector, &dtable);

3263

3264 if (dtable.limit < index * 8 + 7) {

3265 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);

3266 return 1;

3267 }

3268 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);

3269 gpa += index * 8;

3270 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);

3271 }

3272

3273 /* allowed just for 8 bytes segments */

3274 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,

3275 struct desc_struct *seg_desc)

3276 {

3277 gpa_t gpa;

3278 struct descriptor_table dtable;

3279 u16 index = selector >> 3;

3280

3281 get_segment_descritptor_dtable(vcpu, selector, &dtable);

3282

3283 if (dtable.limit < index * 8 + 7)

3284 return 1;

3285 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);

3286 gpa += index * 8;

3287 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);

3288 }

3289

3290 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,

3291 struct desc_struct *seg_desc)

3292 {

3293 u32 base_addr;

3294

3295 base_addr = seg_desc->base0;

3296 base_addr |= (seg_desc->base1 << 16);

3297 base_addr |= (seg_desc->base2 << 24);

3298

3299 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);

3300 }

3301

3302 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)

3303 {

3304 struct kvm_segment kvm_seg;

3305

3306 kvm_get_segment(vcpu, &kvm_seg, seg);

3307 return kvm_seg.selector;

3308 }

3309

3310 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,

3311 u16 selector,

3312 struct kvm_segment *kvm_seg)

3313 {

3314 struct desc_struct seg_desc;

3315

3316 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))

3317 return 1;

3318 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);

3319 return 0;

3320 }

3321

3322 int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)

3323 {

3324 struct kvm_segment segvar = {

3325 .base = selector << 4,

3326 .limit = 0xffff,

3327 .selector = selector,

3328 .type = 3,

3329 .present = 1,

3330 .dpl = 3,

3331 .db = 0,

3332 .s = 1,

3333 .l = 0,

3334 .g = 0,

3335 .avl = 0,

3336 .unusable = 0,

3337 };

3338 kvm_x86_ops->set_segment(vcpu, &segvar, seg);

3339 return 0;

3340 }

3341

3342 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,

3343 int type_bits, int seg)

3344 {

3345 struct kvm_segment kvm_seg;

3346

3347 if (!(vcpu->arch.cr0 & X86_CR0_PE))

3348 return kvm_load_realmode_segment(vcpu, selector, seg);

3349 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))

3350 return 1;

3351 kvm_seg.type |= type_bits;

3352

3353 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&

3354 seg != VCPU_SREG_LDTR)

3355 if (!kvm_seg.s)

3356 kvm_seg.unusable = 1;

3357

3358 kvm_set_segment(vcpu, &kvm_seg, seg);

3359 return 0;

3360 }

3361

3362 static void save_state_to_tss32(struct kvm_vcpu *vcpu,

3363 struct tss_segment_32 *tss)

3364 {

3365 tss->cr3 = vcpu->arch.cr3;

3366 tss->eip = vcpu->arch.rip;

3367 tss->eflags = kvm_x86_ops->get_rflags(vcpu);

3368 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];

3369 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];

3370 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];

3371 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];

3372 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];

3373 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];

3374 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];

3375 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];

3376

3377 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);

3378 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);

3379 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);

3380 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);

3381 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);

3382 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);

3383 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);

3384 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);

3385 }

3386

3387 static int load_state_from_tss32(struct kvm_vcpu *vcpu,

3388 struct tss_segment_32 *tss)

3389 {

3390 kvm_set_cr3(vcpu, tss->cr3);

3391

3392 vcpu->arch.rip = tss->eip;

3393 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);

3394

3395 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;

3396 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;

3397 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;

3398 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;

3399 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;

3400 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;

3401 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;

3402 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;

3403

3404 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))

3405 return 1;

3406

3407 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))

3408 return 1;

3409

3410 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))

3411 return 1;

3412

3413 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))

3414 return 1;

3415

3416 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))

3417 return 1;

3418

3419 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))

3420 return 1;

3421

3422 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))

3423 return 1;

3424 return 0;

3425 }

3426

3427 static void save_state_to_tss16(struct kvm_vcpu *vcpu,

3428 struct tss_segment_16 *tss)

3429 {

3430 tss->ip = vcpu->arch.rip;

3431 tss->flag = kvm_x86_ops->get_rflags(vcpu);

3432 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];

3433 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];

3434 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];

3435 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];

3436 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];

3437 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];

3438 tss->si = vcpu->arch.regs[VCPU_REGS_RSI];

3439 tss->di = vcpu->arch.regs[VCPU_REGS_RDI];

3440

3441 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);

3442 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);

3443 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);

3444 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);

3445 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);

3446 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);

3447 }

3448

3449 static int load_state_from_tss16(struct kvm_vcpu *vcpu,

3450 struct tss_segment_16 *tss)

3451 {

3452 vcpu->arch.rip = tss->ip;

3453 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);

3454 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;

3455 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;

3456 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;

3457 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;

3458 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;

3459 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;

3460 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;

3461 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;

3462

3463 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))

3464 return 1;

3465

3466 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))

3467 return 1;

3468

3469 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))

3470 return 1;

3471

3472 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))

3473 return 1;

3474

3475 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))

3476 return 1;

3477 return 0;

3478 }

3479

3480 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,

3481 u32 old_tss_base,

3482 struct desc_struct *nseg_desc)

3483 {

3484 struct tss_segment_16 tss_segment_16;

3485 int ret = 0;

3486

3487 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,

3488 sizeof tss_segment_16))

3489 goto out;

3490

3491 save_state_to_tss16(vcpu, &tss_segment_16);

3492

3493 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,

3494 sizeof tss_segment_16))

3495 goto out;

3496

3497 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),

3498 &tss_segment_16, sizeof tss_segment_16))

3499 goto out;

3500

3501 if (load_state_from_tss16(vcpu, &tss_segment_16))

3502 goto out;

3503

3504 ret = 1;

3505 out:

3506 return ret;

3507 }

3508

3509 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,

3510 u32 old_tss_base,

3511 struct desc_struct *nseg_desc)

3512 {

3513 struct tss_segment_32 tss_segment_32;

3514 int ret = 0;

3515

3516 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,

3517 sizeof tss_segment_32))

3518 goto out;

3519

3520 save_state_to_tss32(vcpu, &tss_segment_32);

3521

3522 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,

3523 sizeof tss_segment_32))

3524 goto out;

3525

3526 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),

3527 &tss_segment_32, sizeof tss_segment_32))

3528 goto out;

3529

3530 if (load_state_from_tss32(vcpu, &tss_segment_32))

3531 goto out;

3532

3533 ret = 1;

3534 out:

3535 return ret;

3536 }

3537

3538 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)

3539 {

3540 struct kvm_segment tr_seg;

3541 struct desc_struct cseg_desc;

3542 struct desc_struct nseg_desc;

3543 int ret = 0;

3544 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);

3545 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);

3546

3547 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);

3548

3549 /* FIXME: Handle errors. Failure to read either TSS or their

3550 * descriptors should generate a pagefault.

3551 */

3552 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))

3553 goto out;

3554

3555 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))

3556 goto out;

3557

3558 if (reason != TASK_SWITCH_IRET) {

3559 int cpl;

3560

3561 cpl = kvm_x86_ops->get_cpl(vcpu);

3562 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {

3563 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);

3564 return 1;

3565 }

3566 }

3567

3568 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {

3569 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);

3570 return 1;

3571 }

3572

3573 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {

3574 cseg_desc.type &= ~(1 << 1); //clear the B flag

3575 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);

3576 }

3577

3578 if (reason == TASK_SWITCH_IRET) {

3579 u32 eflags = kvm_x86_ops->get_rflags(vcpu);

3580 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);

3581 }

3582

3583 kvm_x86_ops->skip_emulated_instruction(vcpu);

3584 kvm_x86_ops->cache_regs(vcpu);

3585

3586 if (nseg_desc.type & 8)

3587 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,

3588 &nseg_desc);

3589 else

3590 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,

3591 &nseg_desc);

3592

3593 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {

3594 u32 eflags = kvm_x86_ops->get_rflags(vcpu);

3595 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);

3596 }

3597

3598 if (reason != TASK_SWITCH_IRET) {

3599 nseg_desc.type |= (1 << 1);

3600 save_guest_segment_descriptor(vcpu, tss_selector,

3601 &nseg_desc);

3602 }

3603

3604 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);

3605 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);

3606 tr_seg.type = 11;

3607 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);

3608 out:

3609 kvm_x86_ops->decache_regs(vcpu);

3610 return ret;

3611 }

3612 EXPORT_SYMBOL_GPL(kvm_task_switch);

3613

3614 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,

3615 struct kvm_sregs *sregs)

3616 {

3617 int mmu_reset_needed = 0;

3618 int i, pending_vec, max_bits;

3619 struct descriptor_table dt;

3620

3621 vcpu_load(vcpu);

3622

3623 dt.limit = sregs->idt.limit;

3624 dt.base = sregs->idt.base;

3625 kvm_x86_ops->set_idt(vcpu, &dt);

3626 dt.limit = sregs->gdt.limit;

3627 dt.base = sregs->gdt.base;

3628 kvm_x86_ops->set_gdt(vcpu, &dt);

3629

3630 vcpu->arch.cr2 = sregs->cr2;

3631 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;

3632 vcpu->arch.cr3 = sregs->cr3;

3633

3634 kvm_set_cr8(vcpu, sregs->cr8);

3635

3636 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;

3637 kvm_x86_ops->set_efer(vcpu, sregs->efer);

3638 kvm_set_apic_base(vcpu, sregs->apic_base);

3639

3640 kvm_x86_ops->decache_cr4_guest_bits(vcpu);

3641

3642 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;

3643 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);

3644 vcpu->arch.cr0 = sregs->cr0;

3645

3646 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;

3647 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);

3648 if (!is_long_mode(vcpu) && is_pae(vcpu))

3649 load_pdptrs(vcpu, vcpu->arch.cr3);

3650

3651 if (mmu_reset_needed)

3652 kvm_mmu_reset_context(vcpu);

3653

3654 if (!irqchip_in_kernel(vcpu->kvm)) {

3655 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,

3656 sizeof vcpu->arch.irq_pending);

3657 vcpu->arch.irq_summary = 0;

3658 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)

3659 if (vcpu->arch.irq_pending[i])

3660 __set_bit(i, &vcpu->arch.irq_summary);

3661 } else {

3662 max_bits = (sizeof sregs->interrupt_bitmap) << 3;

3663 pending_vec = find_first_bit(

3664 (const unsigned long *)sregs->interrupt_bitmap,

3665 max_bits);

3666 /* Only pending external irq is handled here */

3667 if (pending_vec < max_bits) {

3668 kvm_x86_ops->set_irq(vcpu, pending_vec);

3669 pr_debug("Set back pending irq %d\n",

3670 pending_vec);

3671 }

3672 }

3673

3674 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);

3675 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);

3676 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);

3677 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);

3678 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);

3679 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

3680

3681 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);

3682 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

3683

3684 vcpu_put(vcpu);

3685

3686 return 0;

3687 }

3688

3689 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,

3690 struct kvm_debug_guest *dbg)

3691 {

3692 int r;

3693

3694 vcpu_load(vcpu);

3695

3696 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);

3697

3698 vcpu_put(vcpu);

3699

3700 return r;

3701 }

3702

3703 /*

3704 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when

3705 * we have asm/x86/processor.h

3706 */

3707 struct fxsave {

3708 u16 cwd;

3709 u16 swd;

3710 u16 twd;

3711 u16 fop;

3712 u64 rip;

3713 u64 rdp;

3714 u32 mxcsr;

3715 u32 mxcsr_mask;

3716 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */

3717 #ifdef CONFIG_X86_64

3718 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */

3719 #else

3720 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */

3721 #endif

3722 };

3723

3724 /*

3725 * Translate a guest virtual address to a guest physical address.

3726 */

3727 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,

3728 struct kvm_translation *tr)

3729 {

3730 unsigned long vaddr = tr->linear_address;

3731 gpa_t gpa;

3732

3733 vcpu_load(vcpu);

3734 down_read(&vcpu->kvm->slots_lock);

3735 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);

3736 up_read(&vcpu->kvm->slots_lock);

3737 tr->physical_address = gpa;

3738 tr->valid = gpa != UNMAPPED_GVA;

3739 tr->writeable = 1;

3740 tr->usermode = 0;

3741 vcpu_put(vcpu);

3742

3743 return 0;

3744 }

3745

3746 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)

3747 {

3748 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

3749

3750 vcpu_load(vcpu);

3751

3752 memcpy(fpu->fpr, fxsave->st_space, 128);

3753 fpu->fcw = fxsave->cwd;

3754 fpu->fsw = fxsave->swd;

3755 fpu->ftwx = fxsave->twd;

3756 fpu->last_opcode = fxsave->fop;

3757 fpu->last_ip = fxsave->rip;

3758 fpu->last_dp = fxsave->rdp;

3759 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

3760

3761 vcpu_put(vcpu);

3762

3763 return 0;

3764 }

3765

3766 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)

3767 {

3768 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

3769

3770 vcpu_load(vcpu);

3771

3772 memcpy(fxsave->st_space, fpu->fpr, 128);

3773 fxsave->cwd = fpu->fcw;

3774 fxsave->swd = fpu->fsw;

3775 fxsave->twd = fpu->ftwx;

3776 fxsave->fop = fpu->last_opcode;

3777 fxsave->rip = fpu->last_ip;

3778 fxsave->rdp = fpu->last_dp;

3779 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

3780

3781 vcpu_put(vcpu);

3782

3783 return 0;

3784 }

3785

3786 void fx_init(struct kvm_vcpu *vcpu)

3787 {

3788 unsigned after_mxcsr_mask;

3789

3790 /*

3791 * Touch the fpu the first time in non atomic context as if

3792 * this is the first fpu instruction the exception handler

3793 * will fire before the instruction returns and it'll have to

3794 * allocate ram with GFP_KERNEL.

3795 */

3796 if (!used_math())

3797 kvm_fx_save(&vcpu->arch.host_fx_image);

3798

3799 /* Initialize guest FPU by resetting ours and saving into guest's */

3800 preempt_disable();

3801 kvm_fx_save(&vcpu->arch.host_fx_image);

3802 kvm_fx_finit();

3803 kvm_fx_save(&vcpu->arch.guest_fx_image);

3804 kvm_fx_restore(&vcpu->arch.host_fx_image);

3805 preempt_enable();

3806

3807 vcpu->arch.cr0 |= X86_CR0_ET;

3808 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);

3809 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;

3810 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,

3811 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);

3812 }

3813 EXPORT_SYMBOL_GPL(fx_init);

3814

3815 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)

3816 {

3817 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)

3818 return;

3819

3820 vcpu->guest_fpu_loaded = 1;

3821 kvm_fx_save(&vcpu->arch.host_fx_image);

3822 kvm_fx_restore(&vcpu->arch.guest_fx_image);

3823 }

3824 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

3825

3826 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)

3827 {

3828 if (!vcpu->guest_fpu_loaded)

3829 return;

3830

3831 vcpu->guest_fpu_loaded = 0;

3832 kvm_fx_save(&vcpu->arch.guest_fx_image);

3833 kvm_fx_restore(&vcpu->arch.host_fx_image);

3834 ++vcpu->stat.fpu_reload;

3835 }

3836 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);

3837

3838 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)

3839 {

3840 kvm_x86_ops->vcpu_free(vcpu);

3841 }

3842

3843 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,

3844 unsigned int id)

3845 {

3846 return kvm_x86_ops->vcpu_create(kvm, id);

3847 }

3848

3849 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)

3850 {

3851 int r;

3852

3853 /* We do fxsave: this must be aligned. */

3854 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);

3855

3856 vcpu_load(vcpu);

3857 r = kvm_arch_vcpu_reset(vcpu);

3858 if (r == 0)

3859 r = kvm_mmu_setup(vcpu);

3860 vcpu_put(vcpu);

3861 if (r < 0)

3862 goto free_vcpu;

3863

3864 return 0;

3865 free_vcpu:

3866 kvm_x86_ops->vcpu_free(vcpu);

3867 return r;

3868 }

3869

3870 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)

3871 {

3872 vcpu_load(vcpu);

3873 kvm_mmu_unload(vcpu);

3874 vcpu_put(vcpu);

3875

3876 kvm_x86_ops->vcpu_free(vcpu);

3877 }

3878

3879 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)

3880 {

3881 return kvm_x86_ops->vcpu_reset(vcpu);

3882 }

3883

3884 void kvm_arch_hardware_enable(void *garbage)

3885 {

3886 kvm_x86_ops->hardware_enable(garbage);

3887 }

3888

3889 void kvm_arch_hardware_disable(void *garbage)

3890 {

3891 kvm_x86_ops->hardware_disable(garbage);

3892 }

3893

3894 int kvm_arch_hardware_setup(void)

3895 {

3896 return kvm_x86_ops->hardware_setup();

3897 }

3898

3899 void kvm_arch_hardware_unsetup(void)

3900 {

3901 kvm_x86_ops->hardware_unsetup();

3902 }

3903

3904 void kvm_arch_check_processor_compat(void *rtn)

3905 {

3906 kvm_x86_ops->check_processor_compatibility(rtn);

3907 }

3908

3909 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)

3910 {

3911 struct page *page;

3912 struct kvm *kvm;

3913 int r;

3914

3915 BUG_ON(vcpu->kvm == NULL);

3916 kvm = vcpu->kvm;

3917

3918 vcpu->arch.mmu.root_hpa = INVALID_PAGE;

3919 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)

3920 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

3921 else

3922 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

3923

3924 page = alloc_page(GFP_KERNEL | __GFP_ZERO);

3925 if (!page) {

3926 r = -ENOMEM;

3927 goto fail;

3928 }

3929 vcpu->arch.pio_data = page_address(page);

3930

3931 r = kvm_mmu_create(vcpu);

3932 if (r < 0)

3933 goto fail_free_pio_data;

3934

3935 if (irqchip_in_kernel(kvm)) {

3936 r = kvm_create_lapic(vcpu);

3937 if (r < 0)

3938 goto fail_mmu_destroy;

3939 }

3940

3941 return 0;

3942

3943 fail_mmu_destroy:

3944 kvm_mmu_destroy(vcpu);

3945 fail_free_pio_data:

3946 free_page((unsigned long)vcpu->arch.pio_data);

3947 fail:

3948 return r;

3949 }

3950

3951 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)

3952 {

3953 kvm_free_lapic(vcpu);

3954 down_read(&vcpu->kvm->slots_lock);

3955 kvm_mmu_destroy(vcpu);

3956 up_read(&vcpu->kvm->slots_lock);

3957 free_page((unsigned long)vcpu->arch.pio_data);

3958 }

3959

3960 struct kvm *kvm_arch_create_vm(void)

3961 {

3962 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

3963

3964 if (!kvm)

3965 return ERR_PTR(-ENOMEM);

3966

3967 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);

3968

3969 return kvm;

3970 }

3971

/* Unload the MMU of @vcpu while the vcpu is loaded. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	/* The unload runs between vcpu_load() and vcpu_put(). */
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}

3978

3979 static void kvm_free_vcpus(struct kvm *kvm)

3980 {

3981 unsigned int i;

3982

3983 /*

3984 * Unpin any mmu pages first.

3985 */

3986 for (i = 0; i < KVM_MAX_VCPUS; ++i)

3987 if (kvm->vcpus[i])

3988 kvm_unload_vcpu_mmu(kvm->vcpus[i]);

3989 for (i = 0; i < KVM_MAX_VCPUS; ++i) {

3990 if (kvm->vcpus[i]) {

3991 kvm_arch_vcpu_free(kvm->vcpus[i]);

3992 kvm->vcpus[i] = NULL;

3993 }

3994 }

3995

3996 }

3997

3998 void kvm_arch_destroy_vm(struct kvm *kvm)

3999 {

4000 kvm_free_pit(kvm);

4001 kfree(kvm->arch.vpic);

4002 kfree(kvm->arch.vioapic);

4003 kvm_free_vcpus(kvm);

4004 kvm_free_physmem(kvm);

4005 if (kvm->arch.apic_access_page)

4006 put_page(kvm->arch.apic_access_page);

4007 if (kvm->arch.ept_identity_pagetable)

4008 put_page(kvm->arch.ept_identity_pagetable);

4009 kfree(kvm);

4010 }

4011

4012 int kvm_arch_set_memory_region(struct kvm *kvm,

4013 struct kvm_userspace_memory_region *mem,

4014 struct kvm_memory_slot old,

4015 int user_alloc)

4016 {

4017 int npages = mem->memory_size >> PAGE_SHIFT;

4018 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

4019

4020 /*To keep backward compatibility with older userspace,

4021 *x86 needs to hanlde !user_alloc case.

4022 */

4023 if (!user_alloc) {

4024 if (npages && !old.rmap) {

4025 unsigned long userspace_addr;

4026

4027 down_write(&current->mm->mmap_sem);

4028 userspace_addr = do_mmap(NULL, 0,

4029 npages * PAGE_SIZE,

4030 PROT_READ | PROT_WRITE,

4031 MAP_PRIVATE | MAP_ANONYMOUS,

4032 0);

4033 up_write(&current->mm->mmap_sem);

4034

4035 if (IS_ERR((void *)userspace_addr))

4036 return PTR_ERR((void *)userspace_addr);

4037

4038 /* set userspace_addr atomically for kvm_hva_to_rmapp */

4039 spin_lock(&kvm->mmu_lock);

4040 memslot->userspace_addr = userspace_addr;

4041 spin_unlock(&kvm->mmu_lock);

4042 } else {

4043 if (!old.user_alloc && old.rmap) {

4044 int ret;

4045

4046 down_write(&current->mm->mmap_sem);

4047 ret = do_munmap(current->mm, old.userspace_addr,

4048 old.npages * PAGE_SIZE);

4049 up_write(&current->mm->mmap_sem);

4050 if (ret < 0)

4051 printk(KERN_WARNING

4052 "kvm_vm_ioctl_set_memory_region: "

4053 "failed to munmap memory\n");

4054 }

4055 }

4056 }

4057

4058 if (!kvm->arch.n_requested_mmu_pages) {

4059 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);

4060 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);

4061 }

4062

4063 kvm_mmu_slot_remove_write_access(kvm, mem->slot);

4064 kvm_flush_remote_tlbs(kvm);

4065

4066 return 0;

4067 }

4068

/* Flush the shadow state of @kvm by calling kvm_mmu_zap_all(). */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

4073

4074 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)

4075 {

4076 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE

4077 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;

4078 }

4079

/*
 * IPI callback used by kvm_vcpu_kick().  It has no work of its own; in
 * DEBUG builds it only logs the vcpu pointer it was handed in @info.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = info;

	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}

4087

4088 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)

4089 {

4090 int ipi_pcpu = vcpu->cpu;

4091 int cpu = get_cpu();

4092

4093 if (waitqueue_active(&vcpu->wq)) {

4094 wake_up_interruptible(&vcpu->wq);

4095 ++vcpu->stat.halt_wakeup;

4096 }

4097 /*

4098 * We may be called synchronously with irqs disabled in guest mode,

4099 * So need not to call smp_call_function_single() in that case.

4100 */

4101 if (vcpu->guest_mode && vcpu->cpu != cpu)

4102 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);

4103 put_cpu();

4104 }