lguest: don't share Switcher PTE pages between guests.

We currently use the whole top PGD entry for the switcher, so we
simply share a fixed page of PTEs between all guests (actually, it's
one per Host CPU, to ensure isolation between guests).

Change to a scheme where every guest has its own mappings.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
commit 3412b6ae29
parent f1f394b1c3
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   2013-04-22 14:10:40 +09:30

3 changed files with 106 additions and 165 deletions
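
A note on context for the hunks below: the new page_tables.c code leans on find_spte(), a shadow-PTE lookup helper that this diff calls but does not define (it is introduced elsewhere in the series). Its exact declaration is not quoted here; judging only from the call sites in this patch, its contract is roughly the following sketch, and the parameter names are guesses rather than the real ones:

/*
 * Sketch inferred from usage in this patch, not the actual declaration:
 * walk cpu's shadow page table for Guest address vaddr and return the
 * address of its PTE slot.  With allocate == true, missing levels are
 * filled in, with the two flag arguments controlling how (the patch
 * passes CHECK_GPGD_MASK and _PAGE_TABLE); with allocate == false,
 * NULL comes back if the mapping is absent, which callers treat as
 * failure.
 */
static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
                        int pgd_flags, int pte_flags);

allocate_switcher_mapping() uses the allocating form once per Switcher page; map_switcher_in_guest() uses the non-allocating form because by then the mapping is guaranteed to exist.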

drivers/lguest/core.c

@@ -333,15 +333,10 @@ static int __init init(void)
         if (err)
                 goto out;

-        /* Now we set up the pagetable implementation for the Guests. */
-        err = init_pagetables(lg_switcher_pages);
-        if (err)
-                goto unmap;
-
         /* We might need to reserve an interrupt vector. */
         err = init_interrupts();
         if (err)
-                goto free_pgtables;
+                goto unmap;

         /* /dev/lguest needs to be registered. */
         err = lguest_device_init();
@@ -356,8 +351,6 @@ static int __init init(void)
 free_interrupts:
         free_interrupts();
-free_pgtables:
-        free_pagetables();
 unmap:
         unmap_switcher();
 out:
         return err;
@@ -369,7 +362,6 @@ static void __exit fini(void)
 {
         lguest_device_remove();
         free_interrupts();
-        free_pagetables();
         unmap_switcher();

         lguest_arch_host_fini();

drivers/lguest/lg.h

@@ -14,9 +14,6 @@
 #include <asm/lguest.h>

-void free_pagetables(void);
-int init_pagetables(struct page **switcher_pages);
-
 struct pgdir {
         unsigned long gpgdir;
         pgd_t *pgdir;

drivers/lguest/page_tables.c

@@ -62,20 +62,11 @@
  * will need the last pmd entry of the last pmd page.
  */
 #ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX      (PTRS_PER_PMD - 1)
 #define CHECK_GPGD_MASK _PAGE_PRESENT
 #else
 #define CHECK_GPGD_MASK _PAGE_TABLE
 #endif

-/*
- * We actually need a separate PTE page for each CPU.  Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu)  per_cpu(switcher_pte_pages, cpu)
-
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
  * clear and clean.  The kernel itself provides many of them; one advantage
@@ -714,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
                               int *blank_pgdir)
 {
         unsigned int next;
-#ifdef CONFIG_X86_PAE
-        pmd_t *pmd_table;
-#endif

         /*
          * We pick one entry at random to throw out. Choosing the Least
@@ -731,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
                 if (!cpu->lg->pgdirs[next].pgdir)
                         next = cpu->cpu_pgd;
                 else {
-#ifdef CONFIG_X86_PAE
                         /*
-                         * In PAE mode, allocate a pmd page and populate the
-                         * last pgd entry.
+                         * This is a blank page, so there are no kernel
+                         * mappings: caller must map the stack!
                          */
-                        pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-                        if (!pmd_table) {
-                                free_page((long)cpu->lg->pgdirs[next].pgdir);
-                                set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
-                                next = cpu->cpu_pgd;
-                        } else {
-                                set_pgd(cpu->lg->pgdirs[next].pgdir +
-                                        SWITCHER_PGD_INDEX,
-                                        __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-                                /*
-                                 * This is a blank page, so there are no kernel
-                                 * mappings: caller must map the stack!
-                                 */
-                                *blank_pgdir = 1;
-                        }
-#else
                         *blank_pgdir = 1;
-#endif
                 }
         }
         /* Record which Guest toplevel this shadows. */
@@ -764,6 +734,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
         return next;
 }

+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here, and populate it when we're about to run
+ * the guest.
+ */
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
+{
+        int i;
+
+        for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+                if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+                               CHECK_GPGD_MASK, _PAGE_TABLE))
+                        return false;
+        }
+        return true;
+}
+
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -774,28 +761,14 @@ static void release_all_pagetables(struct lguest *lg)
         unsigned int i, j;

         /* Every shadow pagetable this Guest has */
-        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-                if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
-                        pgd_t *spgd;
-                        pmd_t *pmdpage;
-                        unsigned int k;
+        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+                if (!lg->pgdirs[i].pgdir)
+                        continue;

-                        /* Get the last pmd page. */
-                        spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
-                        pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-                        /*
-                         * And release the pmd entries of that pmd page,
-                         * except for the switcher pmd.
-                         */
-                        for (k = 0; k < SWITCHER_PMD_INDEX; k++)
-                                release_pmd(&pmdpage[k]);
-#endif
-                        /* Every PGD entry except the Switcher at the top */
-                        for (j = 0; j < SWITCHER_PGD_INDEX; j++)
-                                release_pgd(lg->pgdirs[i].pgdir + j);
-                }
+                /* Every PGD entry. */
+                for (j = 0; j < PTRS_PER_PGD; j++)
+                        release_pgd(lg->pgdirs[i].pgdir + j);
+        }
 }

 /*
@@ -809,6 +782,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
         release_all_pagetables(cpu->lg);
         /* We need the Guest kernel stack mapped again. */
         pin_stack_pages(cpu);
+        /* And we need Switcher allocated. */
+        if (!allocate_switcher_mapping(cpu))
+                kill_guest(cpu, "Cannot populate switcher mapping");
 }

 /*H:430
@@ -844,9 +820,15 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
                 newpgdir = new_pgdir(cpu, pgtable, &repin);
         /* Change the current pgd index to the new one. */
         cpu->cpu_pgd = newpgdir;
-        /* If it was completely blank, we map in the Guest kernel stack */
+        /*
+         * If it was completely blank, we map in the Guest kernel stack and
+         * the Switcher.
+         */
         if (repin)
                 pin_stack_pages(cpu);
+
+        if (!allocate_switcher_mapping(cpu))
+                kill_guest(cpu, "Cannot populate switcher mapping");
 }

 /*:*/
@@ -976,14 +958,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
         int pgdir;

-        if (idx >= SWITCHER_PGD_INDEX)
+        if (idx > PTRS_PER_PGD) {
+                kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+                           idx, PTRS_PER_PGD);
                 return;
+        }

         /* If they're talking about a page table we have a shadow for... */
         pgdir = find_pgdir(lg, gpgdir);
-        if (pgdir < ARRAY_SIZE(lg->pgdirs))
+        if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
                 /* ... throw it away. */
                 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+                /* That might have been the Switcher mapping, remap it. */
+                if (!allocate_switcher_mapping(&lg->cpus[0])) {
+                        kill_guest(&lg->cpus[0],
+                                   "Cannot populate switcher mapping");
+                }
+        }
 }

 #ifdef CONFIG_X86_PAE
@@ -1001,6 +992,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
  * we will populate on future faults. The Guest doesn't have any actual
  * pagetables yet, so we set linear_pages to tell demand_page() to fake it
  * for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
@@ -1014,6 +1008,13 @@ int init_guest_pagetable(struct lguest *lg)
         /* We start with a linear mapping until the initialize. */
         cpu->linear_pages = true;

+        /* Allocate the page tables for the Switcher. */
+        if (!allocate_switcher_mapping(cpu)) {
+                release_all_pagetables(lg);
+                return -ENOMEM;
+        }
+
         return 0;
 }
@@ -1065,92 +1066,69 @@ void free_guest_pagetable(struct lguest *lg)
  * (vi) Mapping the Switcher when the Guest is about to run.
  *
  * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+ * Guest (and not the pages for other CPUs).
+ *
+ * The pages have all been allocate
  */
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
-        pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
-        pte_t regs_pte;
+        unsigned long base, i;
+        struct page *percpu_switcher_page, *regs_page;
+        pte_t *pte;

-#ifdef CONFIG_X86_PAE
-        pmd_t switcher_pmd;
-        pmd_t *pmd_table;
+        /* Code page should always be mapped, and executable. */
+        pte = find_spte(cpu, switcher_addr, false, 0, 0);
+        get_page(lg_switcher_pages[0]);
+        set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));

-        switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
-                               PAGE_KERNEL_EXEC);
+        /* Clear all the Switcher mappings for any other CPUs. */
+        /* FIXME: This is dumb: update only when Host CPU changes. */
+        for_each_possible_cpu(i) {
+                /* Get location of lguest_pages (indexed by Host CPU) */
+                base = switcher_addr + PAGE_SIZE
+                        + i * sizeof(struct lguest_pages);

-        /* Figure out where the pmd page is, by reading the PGD, and converting
-         * it to a virtual address. */
-        pmd_table = __va(pgd_pfn(cpu->lg->
-                        pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
-                                                        << PAGE_SHIFT);
-        /* Now write it into the shadow page table. */
-        set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
-        pgd_t switcher_pgd;
+                /* Get shadow PTE for first page (where we put guest regs). */
+                pte = find_spte(cpu, base, false, 0, 0);
+                set_pte(pte, __pte(0));
+
+                /* This is where we put R/O state. */
+                pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+                set_pte(pte, __pte(0));
+        }

         /*
-         * Make the last PGD entry for this Guest point to the Switcher's PTE
-         * page for this CPU (with appropriate flags).
+         * When we're running the Guest, we want the Guest's "regs" page to
+         * appear where the first Switcher page for this CPU is. This is an
+         * optimization: when the Switcher saves the Guest registers, it saves
+         * them into the first page of this CPU's "struct lguest_pages": if we
+         * make sure the Guest's register page is already mapped there, we
+         * don't have to copy them out again.
          */
-        switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
-
-        cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
-#endif
+        /* Find the shadow PTE for this regs page. */
+        base = switcher_addr + PAGE_SIZE
+                + raw_smp_processor_id() * sizeof(struct lguest_pages);
+        pte = find_spte(cpu, base, false, 0, 0);
+        regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+        get_page(regs_page);
+        set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
+
         /*
-         * We also change the Switcher PTE page.  When we're running the Guest,
-         * we want the Guest's "regs" page to appear where the first Switcher
-         * page for this CPU is. This is an optimization: when the Switcher
-         * saves the Guest registers, it saves them into the first page of this
-         * CPU's "struct lguest_pages": if we make sure the Guest's register
-         * page is already mapped there, we don't have to copy them out
-         * again.
+         * We map the second page of the struct lguest_pages read-only in
+         * the Guest: the IDT, GDT and other things it's not supposed to
+         * change.
          */
-        regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
-        set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
+        base += PAGE_SIZE;
+        pte = find_spte(cpu, base, false, 0, 0);
+
+        percpu_switcher_page
+                = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+        get_page(percpu_switcher_page);
+        set_pte(pte, mk_pte(percpu_switcher_page,
+                            __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
 }
 /*:*/

-static void free_switcher_pte_pages(void)
-{
-        unsigned int i;
-
-        for_each_possible_cpu(i)
-                free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher and per-cpu pages.
- */
-static __init void populate_switcher_pte_page(unsigned int cpu,
-                                              struct page *switcher_pages[])
-{
-        pte_t *pte = switcher_pte_page(cpu);
-        int i;
-
-        /* The first entries maps the Switcher code. */
-        set_pte(&pte[0], mk_pte(switcher_pages[0],
-                                __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-
-        /* The only other thing we map is this CPU's pair of pages. */
-        i = 1 + cpu*2;
-
-        /* First page (Guest registers) is writable from the Guest */
-        set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]),
-                         __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
-
-        /*
-         * The second page contains the "struct lguest_ro_state", and is
-         * read-only.
-         */
-        set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]),
-                           __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-}
-
 /*
  * We've made it through the page table code. Perhaps our tired brains are
  * still processing the details, or perhaps we're simply glad it's over.
@@ -1163,29 +1141,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  *
  * There is just one file remaining in the Host.
  */
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_pages)
-{
-        unsigned int i;
-
-        for_each_possible_cpu(i) {
-                switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
-                if (!switcher_pte_page(i)) {
-                        free_switcher_pte_pages();
-                        return -ENOMEM;
-                }
-                populate_switcher_pte_page(i, switcher_pages);
-        }
-        return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
-        free_switcher_pte_pages();
-}
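
One recurring detail in the new map_switcher_in_guest() above is the address arithmetic that locates a Host CPU's pair of Switcher pages. A small helper expressing it (hypothetical, not in the patch; it only restates the layout the patch assumes: one Switcher code page at switcher_addr, then one struct lguest_pages per Host CPU) would look like:

/*
 * Hypothetical helper, not part of the patch: Guest-virtual base address of
 * the "struct lguest_pages" (regs page + read-only state page) belonging to
 * Host CPU cpu_id, under the layout this patch assumes.
 */
static unsigned long switcher_pages_base(unsigned int cpu_id)
{
        return switcher_addr + PAGE_SIZE
               + cpu_id * sizeof(struct lguest_pages);
}

With such a helper, the computations of "base" in map_switcher_in_guest() would read switcher_pages_base(i) inside the loop and switcher_pages_base(raw_smp_processor_id()) for the current CPU.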