thp: kvm mmu transparent hugepage support
This should work for both hugetlbfs and transparent hugepages.

[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 47ad8475c0
commit 936a5fe6e6
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
 	struct kvm_memory_slot *slot;
-	int host_level, level, max_level;
 
 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 	if (slot && slot->dirty_bitmap)
-		return PT_PAGE_TABLE_LEVEL;
+		return true;
+	return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	int host_level, level, max_level;
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
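The hunk above splits the old mapping_level() into two questions: mapping_level_dirty_bitmap() answers "does dirty logging force 4k mappings for this slot?", while mapping_level() now only computes the largest level the host backing supports. The fault paths further down combine the pair roughly as in this sketch; pick_mapping_level() is a made-up name for illustration, not a function added by the patch:

/* Minimal sketch of the calling convention introduced by this patch;
 * error handling is omitted and the helper name is hypothetical. */
static int pick_mapping_level(struct kvm_vcpu *vcpu, gfn_t *gfn)
{
	int level;

	if (likely(!mapping_level_dirty_bitmap(vcpu, *gfn))) {
		/* the host backing may allow a large (2mb+) mapping */
		level = mapping_level(vcpu, *gfn);
		/* align the gfn to the start of the large frame */
		*gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
	} else {
		/* dirty logging is active: stay at 4k so writes are tracked */
		level = PT_PAGE_TABLE_LEVEL;
	}
	return level;
}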
@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
+	int level = *levelp;
+
+	/*
+	 * Check if it's a transparent hugepage. If this would be an
+	 * hugetlbfs page, level wouldn't be set to
+	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+	 * here.
+	 */
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    PageTransCompound(pfn_to_page(pfn)) &&
+	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+		unsigned long mask;
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn to tail to
+		 * head.
+		 */
+		*levelp = level = PT_DIRECTORY_LEVEL;
+		mask = KVM_PAGES_PER_HPAGE(level) - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*gfnp = gfn;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			if (!get_page_unless_zero(pfn_to_page(pfn)))
+				BUG();
+			*pfnp = pfn;
+		}
+	}
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
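transparent_hugepage_adjust() promotes a 4k fault on a THP-backed gfn to a 2MB mapping by rounding both the gfn and the pfn down to the head of the hugepage; the VM_BUG_ON asserts the two sit at the same offset inside it, and the reference is moved from the tail page to the head page that is now mapped. A standalone worked example of the mask arithmetic, assuming 4k base pages and 2MB hugepages so that KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) is 512 (the sample numbers are invented for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 2MB hugepage = 512 base pages of 4k, so the mask covers the low 9 bits */
	const uint64_t mask = 512 - 1;		/* 0x1ff */

	uint64_t gfn = 0x12345;			/* faulting guest frame  */
	uint64_t pfn = 0x9a345;			/* host frame backing it */

	/* both must sit at the same offset inside their hugepage */
	printf("same offset: %d\n", (gfn & mask) == (pfn & mask));

	/* round both down to the head of the 2MB region, as
	 * transparent_hugepage_adjust() does before installing a
	 * PT_DIRECTORY_LEVEL (2MB) mapping */
	gfn &= ~mask;				/* 0x12200 */
	pfn &= ~mask;				/* 0x9a200 */
	printf("gfn=%#llx pfn=%#llx\n",
	       (unsigned long long)gfn, (unsigned long long)pfn);
	return 0;
}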
@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
 	int r;
 	int level;
+	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable;
 
-	level = mapping_level(vcpu, gfn);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		/*
+		 * This path builds a PAE pagetable - so we can map
+		 * 2mb pages at maximum. Therefore check if the level
+		 * is larger than that.
+		 */
+		if (level > PT_DIRECTORY_LEVEL)
+			level = PT_DIRECTORY_LEVEL;
 
-	/*
-	 * This path builds a PAE pagetable - so we can map 2mb pages at
-	 * maximum. Therefore check if the level is larger than that.
-	 */
-	if (level > PT_DIRECTORY_LEVEL)
-		level = PT_DIRECTORY_LEVEL;
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
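The adjustment is placed after mmu_notifier_retry() and inside mmu_lock on purpose: that is the window in which the comment inside transparent_hugepage_adjust() can assume no __split_huge_page_refcount() runs concurrently. Condensing this hunk together with the earlier nonpaging_map() hunk and the surrounding code not shown in the excerpt gives roughly the following shape; direct_map_fault_sketch() is an illustrative rewrite with details elided, not a function from the patch:

/*
 * Illustrative condensation of the direct-map fault path after this
 * patch; the function name is hypothetical.  The numbered steps are
 * the ordering that makes the THP promotion safe.
 */
static int direct_map_fault_sketch(struct kvm_vcpu *vcpu, gva_t v, int write,
				   gfn_t gfn, bool prefault)
{
	int r, level, force_pt_level;
	unsigned long mmu_seq;
	bool map_writable;
	pfn_t pfn;

	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
	if (likely(!force_pt_level)) {
		level = mapping_level(vcpu, gfn);
		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
	} else
		level = PT_PAGE_TABLE_LEVEL;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;		/* 1. sample seq     */
	smp_rmb();
	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
		return 0;				/* 2. gfn -> pfn     */

	spin_lock(&vcpu->kvm->mmu_lock);		/* 3. lock           */
	if (mmu_notifier_retry(vcpu, mmu_seq))		/* 4. recheck seq    */
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	if (likely(!force_pt_level))			/* 5. promote to 2mb */
		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
			 prefault);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}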
@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	int r;
 	int level;
+	int force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	level = mapping_level(vcpu, gfn);
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
+	int force_pt_level;
 	unsigned long mmu_seq;
 	bool map_writable;
 
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
-	if (walker.level >= PT_DIRECTORY_LEVEL) {
+	if (walker.level >= PT_DIRECTORY_LEVEL)
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+	else
+		force_pt_level = 1;
+	if (!force_pt_level) {
 		level = min(walker.level, mapping_level(vcpu, walker.gfn));
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
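In the shadow-paging path the guest page table already bounds the mapping size (level is clamped with min(walker.level, ...)), so when the guest itself maps the address with a 4k pte there is nothing to gain from a large host mapping and the dirty-bitmap lookup is skipped by forcing force_pt_level to 1. Spelled out as a tiny decision helper; the name and the bool parameter are invented for illustration:

/* Illustration only: restates the decision made by the hunk above. */
static int worth_large_mapping(int guest_walker_level, bool dirty_logging)
{
	if (guest_walker_level < PT_DIRECTORY_LEVEL)
		return 0;	/* guest maps 4k, host mapping is 4k anyway */
	if (dirty_logging)
		return 0;	/* dirty bitmap needs 4k granularity        */
	return 1;		/* ask mapping_level() and maybe THP-adjust */
}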
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
+	if (!force_pt_level)
+		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransCompound(struct page *page)
+{
+	return PageCompound(page);
+}
+#else
+static inline int PageTransCompound(struct page *page)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1 << PG_mlocked)
 #else
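PageTransCompound() is the test transparent_hugepage_adjust() uses to spot a THP-backed pfn; it is simply PageCompound() when THP is configured, and a constant 0 otherwise so any branch guarded by it compiles away (adding it in this patch keeps the series bisectable, per the bracketed changelog note). Note that a hugetlbfs page is also compound, which is why the KVM caller additionally requires level == PT_PAGE_TABLE_LEVEL before adjusting. A minimal usage sketch with a hypothetical helper name:

/* Hypothetical helper: true if the pfn points into a compound page.
 * With CONFIG_TRANSPARENT_HUGEPAGE=n this is always false and the
 * compiler can drop the guarded code entirely. */
static bool pfn_in_compound_page(pfn_t pfn)
{
	return pfn_valid(pfn) && PageTransCompound(pfn_to_page(pfn));
}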
@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
-		return PageReserved(page);
+		struct page *head;
+		struct page *tail = pfn_to_page(pfn);
+		head = compound_head(tail);
+		if (head != tail) {
+			smp_rmb();
+			/*
+			 * head may be a dangling pointer.
+			 * __split_huge_page_refcount clears PageTail
+			 * before overwriting first_page, so if
+			 * PageTail is still there it means the head
+			 * pointer isn't dangling.
+			 */
+			if (PageTail(tail)) {
+				/*
+				 * the "head" is not a dangling
+				 * pointer but the hugepage may have
+				 * been splitted from under us (and we
+				 * may not hold a reference count on
+				 * the head page so it can be reused
+				 * before we run PageReferenced), so
+				 * we've to recheck PageTail before
+				 * returning what we just read.
+				 */
+				int reserved = PageReserved(head);
+				smp_rmb();
+				if (PageTail(tail))
+					return reserved;
+			}
+		}
+		return PageReserved(tail);
 	}
 
 	return true;
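kvm_is_mmio_pfn() may now look at a compound head page it holds no reference on, so it double-checks PageTail around the PageReserved() read: __split_huge_page_refcount() clears PageTail before it starts reusing the head, so a snapshot taken between two reads that both saw PageTail set is kept, and anything else falls back to PageReserved(tail). The same shape in a generic, self-contained form (hypothetical names; the writer is assumed to clear the flag before reusing the data, mirroring the smp_rmb() pairing above):

#include <stdatomic.h>
#include <stdbool.h>

/* 'valid' plays the role of PageTail, 'data' the role of the head
 * page's state; the writer clears 'valid' before touching 'data'. */
struct guarded {
	_Atomic bool valid;
	_Atomic int data;
};

/* Read 'data' without pinning the structure: take a snapshot between
 * two checks of the flag and keep it only if both checks saw the flag
 * set; otherwise report failure so the caller uses its fallback. */
static bool read_with_recheck(struct guarded *g, int *out)
{
	if (!atomic_load(&g->valid))
		return false;
	int snapshot = atomic_load(&g->data);
	if (!atomic_load(&g->valid))
		return false;		/* writer intervened, discard */
	*out = snapshot;
	return true;
}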