OpenCloudOS-Kernel/arch/x86/hyperv/mmu.c

244 lines
6.3 KiB
C
Raw Normal View History

2017-08-03 00:09:19 +08:00
#define pr_fmt(fmt) "Hyper-V: " fmt
#include <linux/hyperv.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <asm/fpu/api.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
2017-08-03 00:09:19 +08:00
#define CREATE_TRACE_POINTS
#include <asm/trace/hyperv.h>
2017-08-03 00:09:19 +08:00
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info);
2017-08-03 00:09:19 +08:00
/*
* Fills in gva_list starting from offset. Returns the number of items added.
*/
static inline int fill_gva_list(u64 gva_list[], int offset,
unsigned long start, unsigned long end)
{
int gva_n = offset;
unsigned long cur = start, diff;
do {
diff = end > cur ? end - cur : 0;
gva_list[gva_n] = cur & PAGE_MASK;
/*
* Lower 12 bits encode the number of additional
* pages to flush (in addition to the 'cur' page).
*/
if (diff >= HV_TLB_FLUSH_UNIT) {
2017-08-03 00:09:19 +08:00
gva_list[gva_n] |= ~PAGE_MASK;
cur += HV_TLB_FLUSH_UNIT;
} else if (diff) {
2017-08-03 00:09:19 +08:00
gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT;
cur = end;
}
2017-08-03 00:09:19 +08:00
gva_n++;
} while (cur < end);
return gva_n - offset;
}
static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
2017-08-03 00:09:19 +08:00
{
int cpu, vcpu, gva_n, max_gvas;
struct hv_tlb_flush **flush_pcpu;
struct hv_tlb_flush *flush;
u64 status;
2017-08-03 00:09:19 +08:00
unsigned long flags;
trace_hyperv_mmu_flush_tlb_multi(cpus, info);
if (!hv_hypercall_pg)
2017-08-03 00:09:19 +08:00
goto do_native;
local_irq_save(flags);
flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
if (unlikely(!flush)) {
local_irq_restore(flags);
goto do_native;
}
2017-08-03 00:09:19 +08:00
if (info->mm) {
x86/hyperv: Stop suppressing X86_FEATURE_PCID When hypercall-based TLB flush was enabled for Hyper-V guests PCID feature was deliberately suppressed as a precaution: back then PCID was never exposed to Hyper-V guests and it wasn't clear what will happen if some day it becomes available. The day came and PCID/INVPCID features are already exposed on certain Hyper-V hosts. From TLFS (as of 5.0b) it is unclear how TLB flush hypercalls combine with PCID. In particular the usage of PCID is per-cpu based: the same mm gets different CR3 values on different CPUs. If the hypercall does exact matching this will fail. However, this is not the case. David Zhang explains: "In practice, the AddressSpace argument is ignored on any VM that supports PCIDs. Architecturally, the AddressSpace argument must match the CR3 with PCID bits stripped out (i.e., the low 12 bits of AddressSpace should be 0 in long mode). The flush hypercalls flush all PCIDs for the specified AddressSpace." With this, PCID can be enabled. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: David Zhang <dazhan@microsoft.com> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Aditya Bhandari <adityabh@microsoft.com> Link: https://lkml.kernel.org/r/20180124103629.29980-1-vkuznets@redhat.com
2018-01-24 18:36:29 +08:00
/*
* AddressSpace argument must match the CR3 with PCID bits
* stripped out.
*/
2017-08-03 00:09:19 +08:00
flush->address_space = virt_to_phys(info->mm->pgd);
x86/hyperv: Stop suppressing X86_FEATURE_PCID When hypercall-based TLB flush was enabled for Hyper-V guests PCID feature was deliberately suppressed as a precaution: back then PCID was never exposed to Hyper-V guests and it wasn't clear what will happen if some day it becomes available. The day came and PCID/INVPCID features are already exposed on certain Hyper-V hosts. From TLFS (as of 5.0b) it is unclear how TLB flush hypercalls combine with PCID. In particular the usage of PCID is per-cpu based: the same mm gets different CR3 values on different CPUs. If the hypercall does exact matching this will fail. However, this is not the case. David Zhang explains: "In practice, the AddressSpace argument is ignored on any VM that supports PCIDs. Architecturally, the AddressSpace argument must match the CR3 with PCID bits stripped out (i.e., the low 12 bits of AddressSpace should be 0 in long mode). The flush hypercalls flush all PCIDs for the specified AddressSpace." With this, PCID can be enabled. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: David Zhang <dazhan@microsoft.com> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Aditya Bhandari <adityabh@microsoft.com> Link: https://lkml.kernel.org/r/20180124103629.29980-1-vkuznets@redhat.com
2018-01-24 18:36:29 +08:00
flush->address_space &= CR3_ADDR_MASK;
2017-08-03 00:09:19 +08:00
flush->flags = 0;
} else {
flush->address_space = 0;
flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
}
flush->processor_mask = 0;
if (cpumask_equal(cpus, cpu_present_mask)) {
flush->flags |= HV_FLUSH_ALL_PROCESSORS;
} else {
/*
* From the supplied CPU set we need to figure out if we can get
* away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}
* hypercalls. This is possible when the highest VP number in
* the set is < 64. As VP numbers are usually in ascending order
* and match Linux CPU ids, here is an optimization: we check
* the VP number for the highest bit in the supplied set first
* so we can quickly find out if using *_EX hypercalls is a
* must. We will also check all VP numbers when walking the
* supplied CPU set to remain correct in all cases.
*/
x86/hyperv: Properly deal with empty cpumasks in hyperv_flush_tlb_multi() KASAN detected the following issue: BUG: KASAN: slab-out-of-bounds in hyperv_flush_tlb_multi+0xf88/0x1060 Read of size 4 at addr ffff8880011ccbc0 by task kcompactd0/33 CPU: 1 PID: 33 Comm: kcompactd0 Not tainted 5.14.0-39.el9.x86_64+debug #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 Call Trace: dump_stack_lvl+0x57/0x7d print_address_description.constprop.0+0x1f/0x140 ? hyperv_flush_tlb_multi+0xf88/0x1060 __kasan_report.cold+0x7f/0x11e ? hyperv_flush_tlb_multi+0xf88/0x1060 kasan_report+0x38/0x50 hyperv_flush_tlb_multi+0xf88/0x1060 flush_tlb_mm_range+0x1b1/0x200 ptep_clear_flush+0x10e/0x150 ... Allocated by task 0: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0x7c/0x90 hv_common_init+0xae/0x115 hyperv_init+0x97/0x501 apic_intr_mode_init+0xb3/0x1e0 x86_late_time_init+0x92/0xa2 start_kernel+0x338/0x3eb secondary_startup_64_no_verify+0xc2/0xcb The buggy address belongs to the object at ffff8880011cc800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 960 bytes inside of 1024-byte region [ffff8880011cc800, ffff8880011ccc00) 'hyperv_flush_tlb_multi+0xf88/0x1060' points to hv_cpu_number_to_vp_number() and '960 bytes' means we're trying to get VP_INDEX for CPU#240. 'nr_cpus' here is exactly 240 so we're trying to access past hv_vp_index's last element. This can (and will) happen when 'cpus' mask is empty and cpumask_last() will return '>=nr_cpus'. Commit ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") tried to deal with empty cpumask situation but apparently didn't fully fix the issue. 'cpus' cpumask which is passed to hyperv_flush_tlb_multi() is 'mm_cpumask(mm)' (which is '&mm->cpu_bitmap'). This mask changes every time the particular mm is scheduled/unscheduled on some CPU (see switch_mm_irqs_off()), disabling IRQs on the CPU which is performing remote TLB flush has zero influence on whether the particular process can get scheduled/unscheduled on _other_ CPUs so e.g. in the case where the mm was scheduled on one other CPU and got unscheduled during hyperv_flush_tlb_multi()'s execution will lead to cpumask becoming empty. It doesn't seem that there's a good way to protect 'mm_cpumask(mm)' from changing during hyperv_flush_tlb_multi()'s execution. It would be possible to copy it in the very beginning of the function but this is a waste. It seems we can deal with changing cpumask just fine. When 'cpus' cpumask changes during hyperv_flush_tlb_multi()'s execution, there are two possible issues: - 'Under-flushing': we will not flush TLB on a CPU which got added to the mask while hyperv_flush_tlb_multi() was already running. This is not a problem as this is equal to mm getting scheduled on that CPU right after TLB flush. - 'Over-flushing': we may flush TLB on a CPU which is already cleared from the mask. First, extra TLB flush preserves correctness. Second, Hyper-V's TLB flush hypercall takes 'mm->pgd' argument so Hyper-V may avoid the flush if CR3 doesn't match. Fix the immediate issue with cpumask_last()/hv_cpu_number_to_vp_number() and remove the pointless cpumask_empty() check from the beginning of the function as it really doesn't protect anything. Also, avoid the hypercall altogether when 'flush->processor_mask' ends up being empty. Fixes: ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Reviewed-by: Michael Kelley <mikelley@microsoft.com> Link: https://lore.kernel.org/r/20220106094611.1404218-1-vkuznets@redhat.com Signed-off-by: Wei Liu <wei.liu@kernel.org>
2022-01-06 17:46:11 +08:00
cpu = cpumask_last(cpus);
if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64)
goto do_ex_hypercall;
2017-08-03 00:09:19 +08:00
for_each_cpu(cpu, cpus) {
vcpu = hv_cpu_number_to_vp_number(cpu);
if (vcpu == VP_INVAL) {
local_irq_restore(flags);
goto do_native;
}
2017-08-03 00:09:19 +08:00
if (vcpu >= 64)
goto do_ex_hypercall;
2017-08-03 00:09:19 +08:00
__set_bit(vcpu, (unsigned long *)
&flush->processor_mask);
}
x86/hyperv: Properly deal with empty cpumasks in hyperv_flush_tlb_multi() KASAN detected the following issue: BUG: KASAN: slab-out-of-bounds in hyperv_flush_tlb_multi+0xf88/0x1060 Read of size 4 at addr ffff8880011ccbc0 by task kcompactd0/33 CPU: 1 PID: 33 Comm: kcompactd0 Not tainted 5.14.0-39.el9.x86_64+debug #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 Call Trace: dump_stack_lvl+0x57/0x7d print_address_description.constprop.0+0x1f/0x140 ? hyperv_flush_tlb_multi+0xf88/0x1060 __kasan_report.cold+0x7f/0x11e ? hyperv_flush_tlb_multi+0xf88/0x1060 kasan_report+0x38/0x50 hyperv_flush_tlb_multi+0xf88/0x1060 flush_tlb_mm_range+0x1b1/0x200 ptep_clear_flush+0x10e/0x150 ... Allocated by task 0: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0x7c/0x90 hv_common_init+0xae/0x115 hyperv_init+0x97/0x501 apic_intr_mode_init+0xb3/0x1e0 x86_late_time_init+0x92/0xa2 start_kernel+0x338/0x3eb secondary_startup_64_no_verify+0xc2/0xcb The buggy address belongs to the object at ffff8880011cc800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 960 bytes inside of 1024-byte region [ffff8880011cc800, ffff8880011ccc00) 'hyperv_flush_tlb_multi+0xf88/0x1060' points to hv_cpu_number_to_vp_number() and '960 bytes' means we're trying to get VP_INDEX for CPU#240. 'nr_cpus' here is exactly 240 so we're trying to access past hv_vp_index's last element. This can (and will) happen when 'cpus' mask is empty and cpumask_last() will return '>=nr_cpus'. Commit ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") tried to deal with empty cpumask situation but apparently didn't fully fix the issue. 'cpus' cpumask which is passed to hyperv_flush_tlb_multi() is 'mm_cpumask(mm)' (which is '&mm->cpu_bitmap'). This mask changes every time the particular mm is scheduled/unscheduled on some CPU (see switch_mm_irqs_off()), disabling IRQs on the CPU which is performing remote TLB flush has zero influence on whether the particular process can get scheduled/unscheduled on _other_ CPUs so e.g. in the case where the mm was scheduled on one other CPU and got unscheduled during hyperv_flush_tlb_multi()'s execution will lead to cpumask becoming empty. It doesn't seem that there's a good way to protect 'mm_cpumask(mm)' from changing during hyperv_flush_tlb_multi()'s execution. It would be possible to copy it in the very beginning of the function but this is a waste. It seems we can deal with changing cpumask just fine. When 'cpus' cpumask changes during hyperv_flush_tlb_multi()'s execution, there are two possible issues: - 'Under-flushing': we will not flush TLB on a CPU which got added to the mask while hyperv_flush_tlb_multi() was already running. This is not a problem as this is equal to mm getting scheduled on that CPU right after TLB flush. - 'Over-flushing': we may flush TLB on a CPU which is already cleared from the mask. First, extra TLB flush preserves correctness. Second, Hyper-V's TLB flush hypercall takes 'mm->pgd' argument so Hyper-V may avoid the flush if CR3 doesn't match. Fix the immediate issue with cpumask_last()/hv_cpu_number_to_vp_number() and remove the pointless cpumask_empty() check from the beginning of the function as it really doesn't protect anything. Also, avoid the hypercall altogether when 'flush->processor_mask' ends up being empty. Fixes: ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Reviewed-by: Michael Kelley <mikelley@microsoft.com> Link: https://lore.kernel.org/r/20220106094611.1404218-1-vkuznets@redhat.com Signed-off-by: Wei Liu <wei.liu@kernel.org>
2022-01-06 17:46:11 +08:00
/* nothing to flush if 'processor_mask' ends up being empty */
if (!flush->processor_mask) {
local_irq_restore(flags);
return;
}
2017-08-03 00:09:19 +08:00
}
/*
* We can flush not more than max_gvas with one hypercall. Flush the
* whole address space if we were asked to do more.
*/
max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
if (info->end == TLB_FLUSH_ALL) {
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
flush, NULL);
} else if (info->end &&
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
flush, NULL);
} else {
gva_n = fill_gva_list(flush->gva_list, 0,
info->start, info->end);
status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
gva_n, 0, flush, NULL);
}
goto check_status;
do_ex_hypercall:
status = hyperv_flush_tlb_others_ex(cpus, info);
2017-08-03 00:09:19 +08:00
check_status:
2017-08-03 00:09:19 +08:00
local_irq_restore(flags);
if (hv_result_success(status))
2017-08-03 00:09:19 +08:00
return;
do_native:
native_flush_tlb_multi(cpus, info);
2017-08-03 00:09:19 +08:00
}
static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
struct hv_tlb_flush_ex **flush_pcpu;
struct hv_tlb_flush_ex *flush;
u64 status;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
return HV_STATUS_INVALID_PARAMETER;
flush_pcpu = (struct hv_tlb_flush_ex **)
this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
if (info->mm) {
x86/hyperv: Stop suppressing X86_FEATURE_PCID When hypercall-based TLB flush was enabled for Hyper-V guests PCID feature was deliberately suppressed as a precaution: back then PCID was never exposed to Hyper-V guests and it wasn't clear what will happen if some day it becomes available. The day came and PCID/INVPCID features are already exposed on certain Hyper-V hosts. From TLFS (as of 5.0b) it is unclear how TLB flush hypercalls combine with PCID. In particular the usage of PCID is per-cpu based: the same mm gets different CR3 values on different CPUs. If the hypercall does exact matching this will fail. However, this is not the case. David Zhang explains: "In practice, the AddressSpace argument is ignored on any VM that supports PCIDs. Architecturally, the AddressSpace argument must match the CR3 with PCID bits stripped out (i.e., the low 12 bits of AddressSpace should be 0 in long mode). The flush hypercalls flush all PCIDs for the specified AddressSpace." With this, PCID can be enabled. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: David Zhang <dazhan@microsoft.com> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Aditya Bhandari <adityabh@microsoft.com> Link: https://lkml.kernel.org/r/20180124103629.29980-1-vkuznets@redhat.com
2018-01-24 18:36:29 +08:00
/*
* AddressSpace argument must match the CR3 with PCID bits
* stripped out.
*/
flush->address_space = virt_to_phys(info->mm->pgd);
x86/hyperv: Stop suppressing X86_FEATURE_PCID When hypercall-based TLB flush was enabled for Hyper-V guests PCID feature was deliberately suppressed as a precaution: back then PCID was never exposed to Hyper-V guests and it wasn't clear what will happen if some day it becomes available. The day came and PCID/INVPCID features are already exposed on certain Hyper-V hosts. From TLFS (as of 5.0b) it is unclear how TLB flush hypercalls combine with PCID. In particular the usage of PCID is per-cpu based: the same mm gets different CR3 values on different CPUs. If the hypercall does exact matching this will fail. However, this is not the case. David Zhang explains: "In practice, the AddressSpace argument is ignored on any VM that supports PCIDs. Architecturally, the AddressSpace argument must match the CR3 with PCID bits stripped out (i.e., the low 12 bits of AddressSpace should be 0 in long mode). The flush hypercalls flush all PCIDs for the specified AddressSpace." With this, PCID can be enabled. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: David Zhang <dazhan@microsoft.com> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Aditya Bhandari <adityabh@microsoft.com> Link: https://lkml.kernel.org/r/20180124103629.29980-1-vkuznets@redhat.com
2018-01-24 18:36:29 +08:00
flush->address_space &= CR3_ADDR_MASK;
flush->flags = 0;
} else {
flush->address_space = 0;
flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
}
flush->hv_vp_set.valid_bank_mask = 0;
flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
if (nr_bank < 0)
return HV_STATUS_INVALID_PARAMETER;
/*
* We can flush not more than max_gvas with one hypercall. Flush the
* whole address space if we were asked to do more.
*/
max_gvas =
(PAGE_SIZE - sizeof(*flush) - nr_bank *
sizeof(flush->hv_vp_set.bank_contents[0])) /
sizeof(flush->gva_list[0]);
if (info->end == TLB_FLUSH_ALL) {
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
0, nr_bank, flush, NULL);
} else if (info->end &&
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
0, nr_bank, flush, NULL);
} else {
gva_n = fill_gva_list(flush->gva_list, nr_bank,
info->start, info->end);
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
gva_n, nr_bank, flush, NULL);
}
return status;
}
2017-08-03 00:09:19 +08:00
void hyperv_setup_mmu_ops(void)
{
if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
return;
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
2017-08-03 00:09:19 +08:00
}