// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>

#include <asm/irq_remapping.h>
/*
 * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e. the vCPU ID,
 * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
 * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
 *
 * For the vCPU ID, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_ID_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		(((x) >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		((x) & AVIC_VCPU_ID_MASK)

#define __AVIC_GATAG(vm_id, vcpu_id)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_id) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG(vm_id, vcpu_id)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
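/*
 * Illustrative example (editor's note, not from the original source),
 * assuming AVIC_PHYSICAL_MAX_INDEX_MASK covers bits 8:0: AVIC_VM_ID_SHIFT is
 * then HWEIGHT32(0x1ff) = 9, so vm_id = 0x2 and vcpu_id = 0x5 encode as
 * ga_tag = (0x2 << 9) | 0x5 = 0x405, and the GATAG_TO_* macros recover both
 * fields by shifting and masking.
 */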
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
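/*
 * Editor's note: force_avic is a kvm_amd module parameter, typically set via
 * "kvm_amd.force_avic=1" on the kernel command line; module_param_unsafe()
 * taints the kernel when the parameter is actually used.
 */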
/*
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);

bool x2avic_enabled;
/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using the AVIC doorbell.  KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;

		/* Disabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* For xAVIC and hybrid-xAVIC modes */
		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;

		/* Enabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, true);
	}
}
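/*
 * Illustrative summary (editor's note, not from the original source) of the
 * resulting configurations:
 *
 *   guest APIC mode   x2avic_enabled   result
 *   ---------------   --------------   --------------------------------
 *   xAPIC             either           xAVIC (MSR intercepts enabled)
 *   x2APIC            true             x2AVIC (MSR intercepts disabled)
 *   x2APIC            false            hybrid-AVIC (doorbell only)
 */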
static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's msr bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	svm_set_x2apic_msr_interception(svm, true);
}
/*
 * This function is called from the IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/*
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page, so all that is left is to wake
	 * the target vCPU.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}
void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	if (kvm_svm->avic_logical_id_table_page)
		__free_page(kvm_svm->avic_logical_id_table_page);
	if (kvm_svm->avic_physical_id_table_page)
		__free_page(kvm_svm->avic_physical_id_table_page);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	struct page *p_page;
	struct page *l_page;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	/* Allocating the physical APIC ID table (4KB). */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!p_page)
		goto free_avic;

	kvm_svm->avic_physical_id_table_page = p_page;

	/* Allocating the logical APIC ID table (4KB). */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!l_page)
		goto free_avic;

	kvm_svm->avic_logical_id_table_page = l_page;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
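	/*
	 * Editor's note (illustrative, assuming a 9-bit vCPU ID field):
	 * AVIC_VM_ID_MASK would then be 0x7fffff, so roughly 8M VM IDs are
	 * handed out round-robin before next_vm_id wraps and the collision
	 * check below becomes necessary.
	 */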
	/* Is it still in use? Only possible if wrapped at least once. */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}

	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));

	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
				       unsigned int index)
{
	u64 *avic_physical_id_table;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);

	if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
	    (index > X2AVIC_MAX_PHYSICAL_ID))
		return NULL;

	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);

	return &avic_physical_id_table[index];
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u64 *entry, new_entry;
	int id = vcpu->vcpu_id;
	struct vcpu_svm *svm = to_svm(vcpu);

	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
	    (id > X2AVIC_MAX_PHYSICAL_ID))
		return -EINVAL;

	if (!vcpu->arch.apic->regs)
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses the address in the
		 * AVIC_BACKING_PAGE pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);

	/* Setting the AVIC backing page address in the physical APIC ID table. */
	entry = avic_get_physical_id_entry(vcpu, id);
	if (!entry)
		return -EINVAL;

	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
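	/*
	 * Editor's note (per the AMD APM; layout not spelled out in this
	 * file): a physical ID table entry packs the backing-page address
	 * into its middle bits with IsRunning and Valid near the top, so
	 * the expression above yields a single 64-bit entry that is valid
	 * and points at this vCPU's vAPIC page.
	 */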
	WRITE_ONCE(*entry, new_entry);

	svm->avic_physical_id_cache = entry;

	return 0;
}
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}
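/*
 * Editor's note (illustrative): if the target vCPU was last observed on
 * pCPU 3, the MSR write above rings pCPU 3's doorbell and the running guest
 * re-evaluates its vIRR without a VM-exit; when the caller is already on the
 * target pCPU, the pending interrupt is processed at the next VM entry.
 */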
static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}
static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. here APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}
static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}
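/*
 * Editor's example (illustrative): in x2APIC mode the logical ID is
 * ((x2apic_id >> 4) << 16) | (1 << (x2apic_id & 0xf)), so a logid_index of
 * "cluster base + bit position" is exactly the x2APIC ID, e.g. cluster 3,
 * bit 2 maps back to x2APIC ID 0x32.
 */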
/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * the destination APIC ID to a vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	u32 dest;
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}
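		/*
		 * Editor's example (illustrative): an x2APIC destination of
		 * 0x00030005 decodes to cluster base 3 << 4 = 48 with bitmap
		 * 0x5, i.e. logid_index 48 and 50 (x2APIC IDs 0x30 and 0x32);
		 * a flat xAPIC destination of 0x05 yields indices 0 and 2.
		 */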
		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, e.g. ICR holds the correct value and RIP
		 * has been advanced, so KVM is responsible only for emulating
		 * the IPI.  Sadly, hardware may sometimes leave the BUSY flag
		 * set, in which case KVM needs to emulate the ICR write as
		 * well in order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus.  So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 *logical_apic_id_table;
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);

	return &logical_apic_id_table[index];
}
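/*
 * Editor's example (illustrative): with flat DFR, LDR bit 3 set (logical ID
 * 0x08) gives index 3 directly; with cluster DFR, logical ID 0x21 is
 * cluster 2, bit 0, giving index (2 << 2) + 0 = 8.
 */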
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use the logical APIC ID table. */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}
static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
				AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);

	return 1;
}
static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	/*
	 * (Body elided in this excerpt: the original function switches on
	 * offset and returns true for the APIC register offsets, e.g.
	 * APIC_ID, APIC_LDR, APIC_DFR, whose unaccelerated writes arrive as
	 * traps rather than faults.)
	 */
	bool ret = false;

	return ret;
}
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	int ret = 0;
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}
int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);
	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	if (list_empty(&svm->ir_list))
		goto out;

	list_for_each_entry(ir, &svm->ir_list, node) {
		if (activate)
			ret = amd_iommu_activate_guest_mode(ir->data);
		else
			ret = amd_iommu_deactivate_guest_mode(ir->data);
		if (ret)
			break;
	}
out:
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
	return ret;
}
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	unsigned long flags;
	struct amd_svm_iommu_ir *cur;

	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_for_each_entry(cur, &svm->ir_list, node) {
		if (cur->data != pi->ir_data)
			continue;
		list_del(&cur->node);
		kfree(cur);
		break;
	}
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	int ret = 0;
	unsigned long flags;
	u64 entry;
	struct amd_svm_iommu_ir *ir;

	/*
	 * In some cases, the existing irte is updated and re-set,
	 * so we need to check here if it's already been added
	 * to the ir_list.
	 */
	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
		struct kvm *kvm = svm->vcpu.kvm;
		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
		struct vcpu_svm *prev_svm;

		if (!prev_vcpu) {
			ret = -EINVAL;
			goto out;
		}

		prev_svm = to_svm(prev_vcpu);
		svm_ir_list_del(prev_svm, pi);
	}

	/*
	 * Allocate a new amd_svm_iommu_ir wrapper, which will be added to
	 * the per-vcpu ir_list.
	 */
	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
	if (!ir) {
		ret = -ENOMEM;
		goto out;
	}
	ir->data = pi->ir_data;

	spin_lock_irqsave(&svm->ir_list_lock, flags);

	/*
	 * Update the target pCPU for IOMMU doorbells if the vCPU is running.
	 * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
	 * will update the pCPU info when the vCPU is awakened and/or
	 * scheduled in.  See also avic_vcpu_load().
	 */
	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
				    true, pi->ir_data);

	list_add(&ir->node, &svm->ir_list);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
	return ret;
}
/*
 * Note:
 * The HW cannot support posting multicast/broadcast
 * interrupts to a vCPU, so we still use legacy interrupt
 * remapping for these kinds of interrupts.
 *
 * For lowest-priority interrupts, we only support
 * those with a single CPU as the destination, e.g. the user
 * configures the interrupts via /proc/irq or uses
 * irqbalance to make the interrupts single-CPU.
 */
static int
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
{
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu = NULL;

	kvm_set_msi_irq(kvm, e, &irq);

	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
	    !kvm_irq_is_postable(&irq)) {
		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
			 __func__, irq.vector);
		return -1;
	}

	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
		 irq.vector);
	*svm = to_svm(vcpu);
	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
	vcpu_info->vector = irq.vector;

	return 0;
}
/*
 * avic_pi_update_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
			uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP))
		return 0;

	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
		 __func__, host_irq, guest_irq, set);
	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);

	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		struct vcpu_data vcpu_info;
		struct vcpu_svm *svm = NULL;

		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;
		/*
		 * Here, we set up with legacy mode in the following cases:
		 * 1. When the interrupt cannot be targeted to a specific vcpu.
		 * 2. Unsetting the posted interrupt.
		 * 3. APIC virtualization is disabled for the vcpu.
		 * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc.)
		 */
		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
		    kvm_vcpu_apicv_active(&svm->vcpu)) {
			struct amd_iommu_pi_data pi;

			/* Try to enable guest_mode in the IRTE. */
			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
					    AVIC_HPA_MASK);
			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					       svm->vcpu.vcpu_id);
			pi.is_guest_mode = true;
			pi.vcpu_data = &vcpu_info;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Here, we have successfully set up vcpu affinity in
			 * IOMMU guest mode.  Now, we need to store the posted
			 * interrupt information in a per-vcpu ir_list so that
			 * we can reference it directly when we update the vcpu
			 * scheduling information in the IOMMU irte.
			 */
			if (!ret && pi.is_guest_mode)
				svm_ir_list_add(svm, &pi);
		} else {
			/* Use legacy mode in the IRTE. */
			struct amd_iommu_pi_data pi;

			/*
			 * Here, pi is used to:
			 * - Tell the IOMMU to use legacy mode for this interrupt.
			 * - Retrieve the ga_tag of the prior interrupt remapping data.
			 */
			pi.prev_ga_tag = 0;
			pi.is_guest_mode = false;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Check if the posted interrupt was previously
			 * set up with guest_mode by checking if the ga_tag
			 * was cached.  If so, we need to clean up the per-vcpu
			 * ir_list.
			 */
			if (!ret && pi.prev_ga_tag) {
				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				struct kvm_vcpu *vcpu;

				vcpu = kvm_get_vcpu_by_id(kvm, id);
				if (vcpu)
					svm_ir_list_del(to_svm(vcpu), &pi);
			}
		}
		if (!ret && svm) {
			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
						 e->gsi, vcpu_info.vector,
						 vcpu_info.pi_desc_addr, set);
		}

		if (ret < 0) {
			pr_err("%s: failed to update PI IRTE\n", __func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}
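/*
 * Editor's note (hedged): this update path is typically reached via KVM's
 * IRQ bypass hooks when userspace attaches an assigned device's MSI to an
 * irqfd; the exact call chain varies across kernel versions.
 */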
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
	int ret = 0;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	lockdep_assert_held(&svm->ir_list_lock);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	if (list_empty(&svm->ir_list))
		return 0;

	list_for_each_entry(ir, &svm->ir_list, node) {
		ret = amd_iommu_update_ga(cpu, r, ir->data);
		if (ret)
			return ret;
	}
	return 0;
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	u64 entry;
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change.  Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	u64 entry;
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;

	lockdep_assert_preemption_disabled();

	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	entry = READ_ONCE(*(svm->avic_physical_id_cache));

	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above).  Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception().  In
		 * this case, we need to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	bool activated = kvm_vcpu_apicv_active(vcpu);

	if (!enable_apicv)
		return;

	avic_refresh_virtual_apic_mode(vcpu);

	if (activated)
		avic_vcpu_load(vcpu, vcpu->cpu);
	else
		avic_vcpu_put(vcpu);

	avic_set_pi_irte_mode(vcpu, activated);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_
	 * the vCPU actually blocks.
	 *
	 * Any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source; therefore the vIRR will also
	 * be checked by kvm_vcpu_check_block() before blocking.  The
	 * memory barrier implicit in set_current_state orders writing
	 * IsRunning=0 before reading the vIRR.  The processor needs a
	 * matching memory barrier on interrupt delivery between writing
	 * IRR and reading IsRunning; the lack of this barrier might be
	 * the cause of erratum #1235.
	 */
	avic_vcpu_put(vcpu);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}
/*
 * Note:
 * - The "avic" module param enables both xAPIC and x2APIC mode.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool avic_hardware_setup(void)
{
	if (!npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
		}
		return false;
	}

	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		/*
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor
		 * for more detail.
		 */
		pr_warn("AVIC is not supported in CPUID but force enabled");
		pr_warn("Your system might crash and burn");
	}

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}