From 1b2905d3582fa5fb942083233220ead093c53a1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B0=A2=E6=99=93=E4=B8=9C?=
Date: Tue, 29 Oct 2024 22:16:01 +0800
Subject: [PATCH] KVM: arm64: Add support for FEAT_TLBIRANGE

---
 arch/arm64/include/asm/kvm_asm.h  |   2 +
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/include/asm/tlbflush.h | 127 +++++++++++++++++-------
 arch/arm64/kvm/Kconfig            |   1 -
 arch/arm64/kvm/hyp/tlb.c          |  35 ++++++++
 include/linux/kvm_host.h          |  19 ++++-
 virt/kvm/Kconfig                  |   3 -
 virt/kvm/arm/mmu.c                |  68 ++++++++++++++--
 virt/kvm/kvm_main.c               |  37 ++++++++-
 9 files changed, 224 insertions(+), 70 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index c54e759896c1..0ea0cdb9a2ea 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -58,6 +58,8 @@ extern char __kvm_hyp_init_end[];
 extern char __kvm_hyp_vector[];
 
 extern void __kvm_flush_vm_context(void);
+extern void __kvm_tlb_flush_vmid_range(struct kvm *kvm,
+				       phys_addr_t start, unsigned long pages);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern void __kvm_flush_cpu_context(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e6efdbe88c0a..a639c7fee3a8 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -671,6 +671,8 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 void kvm_set_ipa_limit(void);
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 63c67a5a5b4f..11e9d8bd8b75 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -282,16 +282,77 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
  */
 #define MAX_TLBI_OPS	PTRS_PER_PTE
 
+/*
+ * __flush_tlb_range_op - Perform TLBI operation upon a range
+ *
+ * @op:	TLBI instruction that operates on a range (has 'r' prefix)
+ * @start:	The start address of the range
+ * @pages:	Range as the number of pages from 'start'
+ * @stride:	Flush granularity
+ * @asid:	The ASID of the task (0 for IPA instructions)
+ * @tlb_level:	Translation Table level hint, if known
+ * @tlbi_user:	If 'true', call an additional __tlbi_user()
+ *		(typically for user ASIDs). 'false' for IPA instructions
+ *
+ * When the CPU does not support TLB range operations, flush the TLB
+ * entries one by one at the granularity of 'stride'. If the TLB
+ * range ops are supported, then:
+ *
+ * 1. If 'pages' is odd, flush the first page through non-range
+ *    operations;
+ *
+ * 2. For remaining pages: the minimum range granularity is decided
+ *    by 'scale', so multiple range TLBI operations may be required.
+ *    Start from scale = 0, flush the corresponding number of pages
+ *    ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
+ *    until no pages left.
+ *
+ * Note that certain ranges can be represented by either num = 31 and
+ * scale or num = 0 and scale + 1. The loop below favours the latter
+ * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
+ */
+#define __flush_tlb_range_op(op, start, pages, stride,			\
+				asid, tlb_level, tlbi_user)		\
+do {									\
+	int num = 0;							\
+	int scale = 0;							\
+	unsigned long addr;						\
+									\
+	while (pages > 0) {						\
+		if (!system_supports_tlb_range() ||			\
+		    pages % 2 == 1) {					\
+			addr = __TLBI_VADDR(start, asid);		\
+			__tlbi_level(op, addr, tlb_level);		\
+			if (tlbi_user)					\
+				__tlbi_user_level(op, addr, tlb_level);	\
+			start += stride;				\
+			pages -= stride >> PAGE_SHIFT;			\
+			continue;					\
+		}							\
+									\
+		num = __TLBI_RANGE_NUM(pages, scale);			\
+		if (num >= 0) {						\
+			addr = __TLBI_VADDR_RANGE(start, asid, scale,	\
+						  num, tlb_level);	\
+			__tlbi(r##op, addr);				\
+			if (tlbi_user)					\
+				__tlbi_user(r##op, addr);		\
+			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
+			pages -= __TLBI_RANGE_PAGES(num, scale);	\
+		}							\
+		scale++;						\
+	}								\
+} while (0)
+
+#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level)	\
+	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
+
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
 {
-	int num = 0;
-	int scale = 0;
-	unsigned long asid = ASID(vma->vm_mm);
-	unsigned long addr;
-	unsigned long pages;
+	unsigned long asid, pages;
 
 	start = round_down(start, stride);
 	end = round_up(end, stride);
@@ -311,57 +372,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	}
 
 	dsb(ishst);
+	asid = ASID(vma->vm_mm);
+
+	if (last_level)
+		__flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true);
+	else
+		__flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
 
-	/*
-	 * When the CPU does not support TLB range operations, flush the TLB
-	 * entries one by one at the granularity of 'stride'. If the the TLB
-	 * range ops are supported, then:
-	 *
-	 * 1. If 'pages' is odd, flush the first page through non-range
-	 *    operations;
-	 *
-	 * 2. For remaining pages: the minimum range granularity is decided
-	 *    by 'scale', so multiple range TLBI operations may be required.
-	 *    Start from scale = 0, flush the corresponding number of pages
-	 *    ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
-	 *    until no pages left.
-	 *
-	 * Note that certain ranges can be represented by either num = 31 and
-	 * scale or num = 0 and scale + 1. The loop below favours the latter
-	 * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
-	 */
-	while (pages > 0) {
-		if (!system_supports_tlb_range() ||
-		    pages % 2 == 1) {
-			addr = __TLBI_VADDR(start, asid);
-			if (last_level) {
-				__tlbi_level(vale1is, addr, tlb_level);
-				__tlbi_user_level(vale1is, addr, tlb_level);
-			} else {
-				__tlbi_level(vae1is, addr, tlb_level);
-				__tlbi_user_level(vae1is, addr, tlb_level);
-			}
-			start += stride;
-			pages -= stride >> PAGE_SHIFT;
-			continue;
-		}
-
-		num = __TLBI_RANGE_NUM(pages, scale);
-		if (num >= 0) {
-			addr = __TLBI_VADDR_RANGE(start, asid, scale,
-						  num, tlb_level);
-			if (last_level) {
-				__tlbi(rvale1is, addr);
-				__tlbi_user(rvale1is, addr);
-			} else {
-				__tlbi(rvae1is, addr);
-				__tlbi_user(rvae1is, addr);
-			}
-			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
-			pages -= __TLBI_RANGE_PAGES(num, scale);
-		}
-		scale++;
-	}
 
 	dsb(ish);
 }
 
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a67121d419a2..b62af287c26f 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -24,7 +24,6 @@ config KVM
 	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
-	select HAVE_KVM_ARCH_TLB_FLUSH_ALL
 	select KVM_MMIO
 	select KVM_ARM_HOST
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 7b7213fc17d9..c8aaff28fed0 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -165,6 +165,41 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 	__tlb_switch_to_host(kvm, &cxt);
 }
 
+void __hyp_text __kvm_tlb_flush_vmid_range(struct kvm *kvm,
+					   phys_addr_t start, unsigned long pages)
+{
+	struct tlb_inv_context cxt;
+	unsigned long stride;
+
+	/*
+	 * Since the range of addresses may not be mapped at
+	 * the same level, assume the worst case as PAGE_SIZE
+	 */
+	stride = PAGE_SIZE;
+	start = round_down(start, stride);
+
+
+	if (has_vhe())
+		dsb(ishst);
+
+	/* Switch to requested VMID */
+	kvm = kern_hyp_va(kvm);
+	__tlb_switch_to_guest(kvm, &cxt);
+
+	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+
+	dsb(ish);
+	__tlbi(vmalle1is);
+	dsb(ish);
+	isb();
+
+	/* See the comment in __kvm_tlb_flush_vmid_ipa() */
+	if (!has_vhe() && icache_is_vpipt())
+		__flush_icache_all();
+
+	__tlb_switch_to_host(kvm, &cxt);
+}
+
 void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
 {
 	struct tlb_inv_context cxt;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ba88a2741266..3e80b0bdfdea 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -813,6 +813,9 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
@@ -917,11 +920,23 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
-#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	return -ENOTSUPP;
 }
+#else
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
+#endif
+
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+						   gfn_t gfn, u64 nr_pages)
+{
+	return -EOPNOTSUPP;
+}
+#else
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
 #endif
 
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index aad9284c043a..6ec39b52214c 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -36,9 +36,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT
 config KVM_VFIO
 	bool
 
-config HAVE_KVM_ARCH_TLB_FLUSH_ALL
-	bool
-
 config HAVE_KVM_INVALID_WAKEUPS
 	bool
 
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index c6ba672f07cc..6e20da9bf4f5 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -49,14 +49,15 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 }
 
 /**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  * @kvm:	pointer to kvm structure.
  *
  * Interface to HYP function to flush all VM TLB entries
  */
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+	return 0;
 }
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
@@ -64,6 +65,48 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * kvm_tlb_flush_vmid_range() - Invalidate/flush a range of TLB entries
+ *
+ * @kvm:	pointer to kvm structure
+ * @addr:	The base Intermediate physical address from which to invalidate
+ * @size:	Size of the range from the base to invalidate
+ */
+
+void kvm_tlb_flush_vmid_range(struct kvm *kvm,
+			      phys_addr_t addr, size_t size)
+{
+	unsigned long pages, inval_pages;
+
+	if (!system_supports_tlb_range()) {
+		kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+		return;
+	}
+
+	pages = size >> PAGE_SHIFT;
+	while (pages > 0) {
+		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
+		kvm_call_hyp(__kvm_tlb_flush_vmid_range,
+			     kvm, addr, inval_pages);
+
+		addr += inval_pages << PAGE_SHIFT;
+		pages -= inval_pages;
+	}
+}
+
+static bool stage2_unmap_defer_tlb_flush(void)
+{
+	/*
+	 * If FEAT_TLBIRANGE is implemented, defer the individual
+	 * TLB invalidations until the entire walk is finished, and
+	 * then use the range-based TLBI instructions to do the
+	 * invalidations. Condition deferred TLB invalidation on the
+	 * system supporting FWB as the optimization is entirely
+	 * pointless when the unmap walker needs to perform CMOs.
+	 */
+	return system_supports_tlb_range() && cpus_have_const_cap(ARM64_HAS_STAGE2_FWB);
+}
+
 /*
  * D-Cache management functions. They take the page table entries by
  * value, as they are flushing the cache using the kernel mapping (or
@@ -84,6 +127,15 @@ static void kvm_flush_dcache_pud(pud_t pud)
 	__kvm_flush_dcache_pud(pud);
 }
 
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+				     gfn_t gfn, u64 nr_pages)
+{
+	kvm_tlb_flush_vmid_range(kvm, gfn << PAGE_SHIFT,
+				 nr_pages << PAGE_SHIFT);
+	return 0;
+}
+
 static bool kvm_is_device_pfn(unsigned long pfn)
 {
 	return !pfn_valid(pfn);
@@ -249,9 +301,10 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 	do {
 		if (!pte_none(*pte)) {
 			pte_t old_pte = *pte;
 
 			kvm_set_pte(pte, __pte(0));
-			kvm_tlb_flush_vmid_ipa(kvm, addr);
+			if (!stage2_unmap_defer_tlb_flush())
+				kvm_tlb_flush_vmid_ipa(kvm, addr);
 
 			/* No need to invalidate the cache for device mappings */
 			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
@@ -354,6 +407,11 @@ static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size,
 		next = stage2_pgd_addr_end(kvm, addr, end);
 		if (!stage2_pgd_none(kvm, *pgd))
 			unmap_stage2_puds(kvm, pgd, addr, next);
+
+		if (stage2_unmap_defer_tlb_flush())
+			/* Perform the deferred TLB invalidations */
+			kvm_tlb_flush_vmid_range(kvm, addr, size);
+
 		/*
 		 * If the range is too large, release the kvm->mmu_lock
 		 * to prevent starvation and lockup detector warnings.
@@ -1553,7 +1611,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	spin_lock(&kvm->mmu_lock);
 	stage2_wp_range(kvm, start, end);
 	spin_unlock(&kvm->mmu_lock);
-	kvm_flush_remote_tlbs(kvm);
+	kvm_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
 /**
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6379b85d0672..f654f8855d05 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -295,7 +295,33 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	return called;
 }
 
-#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
+{
+	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
+		return;
+
+	/*
+	 * Fall back to a full TLB flush if the architecture range-based
+	 * TLB invalidation is unsupported or can't be performed for whatever
+	 * reason.
+	 */
+	kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot)
+{
+	/*
+	 * All current use cases for flushing the TLBs for a specific memslot
+	 * are related to dirty logging, and many do the TLB flush out of
+	 * mmu_lock. The interaction between the various operations on memslot
+	 * must be serialized by slots_lock to ensure the TLB flush from one
+	 * operation is observed by any other operation on the same memslot.
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
+}
+
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
 	/*
@@ -315,13 +341,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 	 * barrier here.
 	 */
-	if (!kvm_arch_flush_remote_tlb(kvm)
+	if (!kvm_arch_flush_remote_tlbs(kvm)
 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
-#endif
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
@@ -1317,7 +1342,8 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		}
 		spin_unlock(&kvm->mmu_lock);
 	}
-
+	if (flush)
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
 		return -EFAULT;
 	return 0;
@@ -1393,6 +1419,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 		}
 	}
 	spin_unlock(&kvm->mmu_lock);
+
+	if (flush)
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 	return 0;
 }
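
For reference, the (scale, num) encoding that __flush_tlb_range_op() walks through can be exercised with a small stand-alone user-space program. This is an illustration only and is not part of the patch: the __TLBI_RANGE_* helpers and TLBI_RANGE_MASK are re-declared locally to mirror the arm64 macros as used above, range TLBI support is simply assumed, the stride is fixed at PAGE_SIZE, and a 4 KiB page size is assumed.

/*
 * User-space sketch of the range encoding used by __flush_tlb_range_op().
 * Not part of this patch; constants mirror the arm64 definitions and a
 * 4K PAGE_SIZE is assumed. Range TLBI support is assumed present.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define TLBI_RANGE_MASK	0x1fUL
/* One range operation covers (num + 1) << (5 * scale + 1) pages */
#define __TLBI_RANGE_PAGES(num, scale)	((unsigned long)((num) + 1) << (5 * (scale) + 1))
#define __TLBI_RANGE_NUM(pages, scale)	\
	((int)(((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)

static void show_range_ops(unsigned long start, unsigned long pages)
{
	int scale = 0, num;

	while (pages > 0) {
		/* Odd page count: peel off one page with a non-range op */
		if (pages % 2 == 1) {
			printf("TLBI       addr=0x%lx (1 page)\n", start);
			start += PAGE_SIZE;
			pages -= 1;
			continue;
		}

		num = __TLBI_RANGE_NUM(pages, scale);
		if (num >= 0) {
			unsigned long covered = __TLBI_RANGE_PAGES(num, scale);

			printf("TLBI range addr=0x%lx scale=%d num=%d (%lu pages)\n",
			       start, scale, num, covered);
			start += covered << PAGE_SHIFT;
			pages -= covered;
		}
		scale++;
	}
}

int main(void)
{
	/* 4101 pages: one single-page op, then 4-page and 4096-page range ops */
	show_range_ops(0x40000000UL, 4101);
	return 0;
}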
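Similarly, the batching performed by kvm_tlb_flush_vmid_range(), which splits a large IPA range into hypercalls of at most MAX_TLBI_RANGE_PAGES pages each, can be sketched in user space under the same assumptions. The hypervisor call is mocked with a printf(); the value of MAX_TLBI_RANGE_PAGES below mirrors __TLBI_RANGE_PAGES(31, 3).

/*
 * User-space sketch of the batching in kvm_tlb_flush_vmid_range():
 * not part of this patch, constants and the hyp call are mocked.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
/* Largest count one range-TLBI sequence covers: (31 + 1) << (5 * 3 + 1) */
#define MAX_TLBI_RANGE_PAGES	((unsigned long)(31 + 1) << (5 * 3 + 1))

static void mock_hyp_flush_vmid_range(unsigned long ipa, unsigned long pages)
{
	printf("__kvm_tlb_flush_vmid_range(ipa=0x%lx, pages=%lu)\n", ipa, pages);
}

static void flush_vmid_range(unsigned long addr, unsigned long size)
{
	unsigned long pages = size >> PAGE_SHIFT, inval_pages;

	while (pages > 0) {
		/* Never ask the hyp call for more than one TLBI sequence can express */
		inval_pages = pages < MAX_TLBI_RANGE_PAGES ?
			      pages : MAX_TLBI_RANGE_PAGES;
		mock_hyp_flush_vmid_range(addr, inval_pages);
		addr += inval_pages << PAGE_SHIFT;
		pages -= inval_pages;
	}
}

int main(void)
{
	/* 9 GiB with 4K pages: one full 8 GiB batch plus a 1 GiB remainder */
	flush_vmid_range(0x80000000UL, 9UL << 30);
	return 0;
}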