KVM: arm64: Add support for FEAT_TLBIRANGE

This commit is contained in:
谢晓东 2024-10-29 22:16:01 +08:00
parent 7fbd37f2dd
commit 1b2905d358
9 changed files with 224 additions and 70 deletions

View File

@ -58,6 +58,8 @@ extern char __kvm_hyp_init_end[];
extern char __kvm_hyp_vector[];
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_range(struct kvm *kvm,
phys_addr_t start, unsigned long pages);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern void __kvm_flush_cpu_context(struct kvm_vcpu *vcpu);

View File

@ -671,6 +671,8 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
void kvm_set_ipa_limit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);

View File

@ -282,16 +282,77 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
*/
#define MAX_TLBI_OPS PTRS_PER_PTE
/*
* __flush_tlb_range_op - Perform TLBI operation upon a range
*
* @op: TLBI instruction that operates on a range (has 'r' prefix)
* @start: The start address of the range
* @pages: Range as the number of pages from 'start'
* @stride: Flush granularity
* @asid: The ASID of the task (0 for IPA instructions)
* @tlb_level: Translation Table level hint, if known
* @tlbi_user: If 'true', call an additional __tlbi_user()
* (typically for user ASIDs). 'flase' for IPA instructions
*
* When the CPU does not support TLB range operations, flush the TLB
* entries one by one at the granularity of 'stride'. If the TLB
* range ops are supported, then:
*
* 1. If 'pages' is odd, flush the first page through non-range
* operations;
*
* 2. For remaining pages: the minimum range granularity is decided
* by 'scale', so multiple range TLBI operations may be required.
* Start from scale = 0, flush the corresponding number of pages
* ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
* until no pages left.
*
* Note that certain ranges can be represented by either num = 31 and
* scale or num = 0 and scale + 1. The loop below favours the latter
* since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
*/
#define __flush_tlb_range_op(op, start, pages, stride, \
asid, tlb_level, tlbi_user) \
do { \
int num = 0; \
int scale = 0; \
unsigned long addr; \
\
while (pages > 0) { \
if (!system_supports_tlb_range() || \
pages % 2 == 1) { \
addr = __TLBI_VADDR(start, asid); \
__tlbi_level(op, addr, tlb_level); \
if (tlbi_user) \
__tlbi_user_level(op, addr, tlb_level); \
start += stride; \
pages -= stride >> PAGE_SHIFT; \
continue; \
} \
\
num = __TLBI_RANGE_NUM(pages, scale); \
if (num >= 0) { \
addr = __TLBI_VADDR_RANGE(start, asid, scale, \
num, tlb_level); \
__tlbi(r##op, addr); \
if (tlbi_user) \
__tlbi_user(r##op, addr); \
start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
pages -= __TLBI_RANGE_PAGES(num, scale); \
} \
scale++; \
} \
} while (0)
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
static inline void __flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
int tlb_level)
{
int num = 0;
int scale = 0;
unsigned long asid = ASID(vma->vm_mm);
unsigned long addr;
unsigned long pages;
unsigned long asid, pages;
start = round_down(start, stride);
end = round_up(end, stride);
@ -311,57 +372,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
}
dsb(ishst);
asid = ASID(vma->vm_mm);
if (last_level)
__flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true);
else
__flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
/*
* When the CPU does not support TLB range operations, flush the TLB
* entries one by one at the granularity of 'stride'. If the the TLB
* range ops are supported, then:
*
* 1. If 'pages' is odd, flush the first page through non-range
* operations;
*
* 2. For remaining pages: the minimum range granularity is decided
* by 'scale', so multiple range TLBI operations may be required.
* Start from scale = 0, flush the corresponding number of pages
* ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
* until no pages left.
*
* Note that certain ranges can be represented by either num = 31 and
* scale or num = 0 and scale + 1. The loop below favours the latter
* since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
*/
while (pages > 0) {
if (!system_supports_tlb_range() ||
pages % 2 == 1) {
addr = __TLBI_VADDR(start, asid);
if (last_level) {
__tlbi_level(vale1is, addr, tlb_level);
__tlbi_user_level(vale1is, addr, tlb_level);
} else {
__tlbi_level(vae1is, addr, tlb_level);
__tlbi_user_level(vae1is, addr, tlb_level);
}
start += stride;
pages -= stride >> PAGE_SHIFT;
continue;
}
num = __TLBI_RANGE_NUM(pages, scale);
if (num >= 0) {
addr = __TLBI_VADDR_RANGE(start, asid, scale,
num, tlb_level);
if (last_level) {
__tlbi(rvale1is, addr);
__tlbi_user(rvale1is, addr);
} else {
__tlbi(rvae1is, addr);
__tlbi_user(rvae1is, addr);
}
start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
pages -= __TLBI_RANGE_PAGES(num, scale);
}
scale++;
}
dsb(ish);
}

View File

@ -24,7 +24,6 @@ config KVM
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_ARM_HOST
select KVM_GENERIC_DIRTYLOG_READ_PROTECT

View File

@ -165,6 +165,41 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
__tlb_switch_to_host(kvm, &cxt);
}
void __hyp_text __kvm_tlb_flush_vmid_range(struct kvm *kvm,
phys_addr_t start, unsigned long pages)
{
struct tlb_inv_context cxt;
unsigned long stride;
/*
* Since the range of addresses may not be mapped at
* the same level, assume the worst case as PAGE_SIZE
*/
stride = PAGE_SIZE;
start = round_down(start, stride);
if (has_vhe())
dsb(ishst);
/* Switch to requested VMID */
kvm = kern_hyp_va(kvm);
__tlb_switch_to_guest(kvm, &cxt);
__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
dsb(ish);
__tlbi(vmalle1is);
dsb(ish);
isb();
/* See the comment in __kvm_tlb_flush_vmid_ipa() */
if (!has_vhe() && icache_is_vpipt())
__flush_icache_all();
__tlb_switch_to_host(kvm, &cxt);
}
void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
{
struct tlb_inv_context cxt;

View File

@ -813,6 +813,9 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_reload_remote_mmus(struct kvm *kvm);
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
@ -917,11 +920,23 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
}
#endif
#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
return -ENOTSUPP;
}
#else
int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
#endif
#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
gfn_t gfn, u64 nr_pages)
{
return -EOPNOTSUPP;
}
#else
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
#endif
#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA

View File

@ -36,9 +36,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT
config KVM_VFIO
bool
config HAVE_KVM_ARCH_TLB_FLUSH_ALL
bool
config HAVE_KVM_INVALID_WAKEUPS
bool

View File

@ -49,14 +49,15 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
}
/**
* kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
* kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
* @kvm: pointer to kvm structure.
*
* Interface to HYP function to flush all VM TLB entries
*/
void kvm_flush_remote_tlbs(struct kvm *kvm)
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
return 0;
}
static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
@ -64,6 +65,48 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}
/**
* kvm_tlb_flush_vmid_range() - Invalidate/flush a range of TLB entries
*
* @kvm: pointer to kvm structure
* @addr: The base Intermediate physical address from which to invalidate
* @size: Size of the range from the base to invalidate
*/
void kvm_tlb_flush_vmid_range(struct kvm *kvm,
phys_addr_t addr, size_t size)
{
unsigned long pages, inval_pages;
if (!system_supports_tlb_range()) {
kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
return;
}
pages = size >> PAGE_SHIFT;
while (pages > 0) {
inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
kvm_call_hyp(__kvm_tlb_flush_vmid_range,
kvm, addr, inval_pages);
addr += inval_pages << PAGE_SHIFT;
pages -= inval_pages;
}
}
static bool stage2_unmap_defer_tlb_flush(void)
{
/*
* If FEAT_TLBIRANGE is implemented, defer the individual
* TLB invalidations until the entire walk is finished, and
* then use the range-based TLBI instructions to do the
* invalidations. Condition deferred TLB invalidation on the
* system supporting FWB as the optimization is entirely
* pointless when the unmap walker needs to perform CMOs.
*/
return system_supports_tlb_range() && cpus_have_const_cap(ARM64_HAS_STAGE2_FWB);
}
/*
* D-Cache management functions. They take the page table entries by
* value, as they are flushing the cache using the kernel mapping (or
@ -84,6 +127,15 @@ static void kvm_flush_dcache_pud(pud_t pud)
__kvm_flush_dcache_pud(pud);
}
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
gfn_t gfn, u64 nr_pages)
{
kvm_tlb_flush_vmid_range(kvm,gfn << PAGE_SHIFT,
nr_pages << PAGE_SHIFT);
return 0;
}
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
@ -249,9 +301,10 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
do {
if (!pte_none(*pte)) {
pte_t old_pte = *pte;
kvm_set_pte(pte, __pte(0));
kvm_tlb_flush_vmid_ipa(kvm, addr);
if (!stage2_unmap_defer_tlb_flush())
kvm_tlb_flush_vmid_ipa(kvm, addr);
/* No need to invalidate the cache for device mappings */
if (!kvm_is_device_pfn(pte_pfn(old_pte)))
@ -354,6 +407,11 @@ static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size,
next = stage2_pgd_addr_end(kvm, addr, end);
if (!stage2_pgd_none(kvm, *pgd))
unmap_stage2_puds(kvm, pgd, addr, next);
if (stage2_unmap_defer_tlb_flush())
/* Perform the deferred TLB invalidations */
kvm_tlb_flush_vmid_range(kvm, addr, size);
/*
* If the range is too large, release the kvm->mmu_lock
* to prevent starvation and lockup detector warnings.
@ -1553,7 +1611,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
spin_lock(&kvm->mmu_lock);
stage2_wp_range(kvm, start, end);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
/**

View File

@ -295,7 +295,33 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
return called;
}
#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
return;
/*
* Fall back to a flushing entire TLBs if the architecture range-based
* TLB invalidation is unsupported or can't be performed for whatever
* reason.
*/
kvm_flush_remote_tlbs(kvm);
}
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
/*
* All current use cases for flushing the TLBs for a specific memslot
* are related to dirty logging, and many do the TLB flush out of
* mmu_lock. The interaction between the various operations on memslot
* must be serialized by slots_locks to ensure the TLB flush from one
* operation is observed by any other operation on the same memslot.
*/
lockdep_assert_held(&kvm->slots_lock);
kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
/*
@ -315,13 +341,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
* barrier here.
*/
if (!kvm_arch_flush_remote_tlb(kvm)
if (!kvm_arch_flush_remote_tlbs(kvm)
|| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif
void kvm_reload_remote_mmus(struct kvm *kvm)
{
@ -1317,7 +1342,8 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
}
spin_unlock(&kvm->mmu_lock);
}
if (flush)
kvm_flush_remote_tlbs_memslot(kvm, memslot);
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
return -EFAULT;
return 0;
@ -1393,6 +1419,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
}
}
spin_unlock(&kvm->mmu_lock);
if(flush)
kvm_flush_remote_tlbs_memslot(kvm, memslot);
return 0;
}