lguest: don't share Switcher PTE pages between guests.

We currently use the whole top PGD entry for the switcher, so we
simply share a fixed page of PTEs between all guests (actually, it's
one per Host CPU, to ensure isolation between guests).

Change to a scheme where every guest has its own mappings.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
commit 3412b6ae29
parent f1f394b1c3
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   2013-04-22 14:10:40 +09:30

3 changed files with 106 additions and 165 deletions
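
A note on context for the hunks below: the new page_tables.c code leans on find_spte(), a shadow-PTE lookup helper that this diff calls but does not define (it is introduced elsewhere in the series). Its exact declaration is not quoted here; judging only from the call sites in this patch, its contract is roughly the following sketch, and the parameter names are guesses rather than the real ones:

/*
 * Sketch inferred from usage in this patch, not the actual declaration:
 * walk cpu's shadow page table for Guest address vaddr and return the
 * address of its PTE slot.  With allocate == true, missing levels are
 * filled in, with the two flag arguments controlling how (the patch
 * passes CHECK_GPGD_MASK and _PAGE_TABLE); with allocate == false,
 * NULL comes back if the mapping is absent, which callers treat as
 * failure.
 */
static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
                        int pgd_flags, int pte_flags);

allocate_switcher_mapping() uses the allocating form once per Switcher page; map_switcher_in_guest() uses the non-allocating form because by then the mapping is guaranteed to exist.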

drivers/lguest/core.c

@@ -333,15 +333,10 @@ static int __init init(void)
         if (err)
                 goto out;

-        /* Now we set up the pagetable implementation for the Guests. */
-        err = init_pagetables(lg_switcher_pages);
-        if (err)
-                goto unmap;
-
         /* We might need to reserve an interrupt vector. */
         err = init_interrupts();
         if (err)
-                goto free_pgtables;
+                goto unmap;

         /* /dev/lguest needs to be registered. */
         err = lguest_device_init();
@@ -356,8 +351,6 @@ static int __init init(void)
 free_interrupts:
         free_interrupts();
-free_pgtables:
-        free_pagetables();
 unmap:
         unmap_switcher();
 out:
         return err;
@@ -369,7 +362,6 @@ static void __exit fini(void)
 {
         lguest_device_remove();
         free_interrupts();
-        free_pagetables();
         unmap_switcher();

         lguest_arch_host_fini();

drivers/lguest/lg.h

@@ -14,9 +14,6 @@
 #include <asm/lguest.h>

-void free_pagetables(void);
-int init_pagetables(struct page **switcher_pages);
-
 struct pgdir {
         unsigned long gpgdir;
         pgd_t *pgdir;

drivers/lguest/page_tables.c

@@ -62,20 +62,11 @@
  * will need the last pmd entry of the last pmd page.
  */
 #ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX      (PTRS_PER_PMD - 1)
 #define CHECK_GPGD_MASK _PAGE_PRESENT
 #else
 #define CHECK_GPGD_MASK _PAGE_TABLE
 #endif

-/*
- * We actually need a separate PTE page for each CPU.  Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu)  per_cpu(switcher_pte_pages, cpu)
-
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
  * clear and clean.  The kernel itself provides many of them; one advantage
@@ -714,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
                               int *blank_pgdir)
 {
         unsigned int next;
-#ifdef CONFIG_X86_PAE
-        pmd_t *pmd_table;
-#endif

         /*
          * We pick one entry at random to throw out. Choosing the Least
@@ -731,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
                 if (!cpu->lg->pgdirs[next].pgdir)
                         next = cpu->cpu_pgd;
                 else {
-#ifdef CONFIG_X86_PAE
                         /*
-                         * In PAE mode, allocate a pmd page and populate the
-                         * last pgd entry.
+                         * This is a blank page, so there are no kernel
+                         * mappings: caller must map the stack!
                          */
-                        pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-                        if (!pmd_table) {
-                                free_page((long)cpu->lg->pgdirs[next].pgdir);
-                                set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
-                                next = cpu->cpu_pgd;
-                        } else {
-                                set_pgd(cpu->lg->pgdirs[next].pgdir +
-                                        SWITCHER_PGD_INDEX,
-                                        __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-                                /*
-                                 * This is a blank page, so there are no kernel
-                                 * mappings: caller must map the stack!
-                                 */
-                                *blank_pgdir = 1;
-                        }
-#else
                         *blank_pgdir = 1;
-#endif
                 }
         }
         /* Record which Guest toplevel this shadows. */
@@ -764,6 +734,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
         return next;
 }

+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here, and populate it when we're about to run
+ * the guest.
+ */
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
+{
+        int i;
+
+        for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+                if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+                               CHECK_GPGD_MASK, _PAGE_TABLE))
+                        return false;
+        }
+        return true;
+}
+
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -774,28 +761,14 @@ static void release_all_pagetables(struct lguest *lg)
         unsigned int i, j;

         /* Every shadow pagetable this Guest has */
-        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-                if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
-                        pgd_t *spgd;
-                        pmd_t *pmdpage;
-                        unsigned int k;
+        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+                if (!lg->pgdirs[i].pgdir)
+                        continue;

-                        /* Get the last pmd page. */
-                        spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
-                        pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-                        /*
-                         * And release the pmd entries of that pmd page,
-                         * except for the switcher pmd.
-                         */
-                        for (k = 0; k < SWITCHER_PMD_INDEX; k++)
-                                release_pmd(&pmdpage[k]);
-#endif
-                        /* Every PGD entry except the Switcher at the top */
-                        for (j = 0; j < SWITCHER_PGD_INDEX; j++)
-                                release_pgd(lg->pgdirs[i].pgdir + j);
-                }
+                /* Every PGD entry. */
+                for (j = 0; j < PTRS_PER_PGD; j++)
+                        release_pgd(lg->pgdirs[i].pgdir + j);
+        }
 }

 /*
@@ -809,6 +782,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
         release_all_pagetables(cpu->lg);
         /* We need the Guest kernel stack mapped again. */
         pin_stack_pages(cpu);
+        /* And we need Switcher allocated. */
+        if (!allocate_switcher_mapping(cpu))
+                kill_guest(cpu, "Cannot populate switcher mapping");
 }

 /*H:430
@@ -844,9 +820,15 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
                 newpgdir = new_pgdir(cpu, pgtable, &repin);
         /* Change the current pgd index to the new one. */
         cpu->cpu_pgd = newpgdir;
-        /* If it was completely blank, we map in the Guest kernel stack */
+        /*
+         * If it was completely blank, we map in the Guest kernel stack and
+         * the Switcher.
+         */
         if (repin)
                 pin_stack_pages(cpu);
+
+        if (!allocate_switcher_mapping(cpu))
+                kill_guest(cpu, "Cannot populate switcher mapping");
 }

 /*:*/
@@ -976,14 +958,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
         int pgdir;

-        if (idx >= SWITCHER_PGD_INDEX)
+        if (idx > PTRS_PER_PGD) {
+                kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+                           idx, PTRS_PER_PGD);
                 return;
+        }

         /* If they're talking about a page table we have a shadow for... */
         pgdir = find_pgdir(lg, gpgdir);
-        if (pgdir < ARRAY_SIZE(lg->pgdirs))
+        if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
                 /* ... throw it away. */
                 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+                /* That might have been the Switcher mapping, remap it. */
+                if (!allocate_switcher_mapping(&lg->cpus[0])) {
+                        kill_guest(&lg->cpus[0],
+                                   "Cannot populate switcher mapping");
+                }
+        }
 }

 #ifdef CONFIG_X86_PAE
@@ -1001,6 +992,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
  * we will populate on future faults. The Guest doesn't have any actual
  * pagetables yet, so we set linear_pages to tell demand_page() to fake it
  * for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
@@ -1014,6 +1008,13 @@ int init_guest_pagetable(struct lguest *lg)
         /* We start with a linear mapping until the initialize. */
         cpu->linear_pages = true;

+        /* Allocate the page tables for the Switcher. */
+        if (!allocate_switcher_mapping(cpu)) {
+                release_all_pagetables(lg);
+                return -ENOMEM;
+        }
+
         return 0;
 }
@@ -1065,92 +1066,69 @@ void free_guest_pagetable(struct lguest *lg)
  * (vi) Mapping the Switcher when the Guest is about to run.
  *
  * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+ * Guest (and not the pages for other CPUs).
+ *
+ * The pages have all been allocate
  */
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
-        pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
-        pte_t regs_pte;
+        unsigned long base, i;
+        struct page *percpu_switcher_page, *regs_page;
+        pte_t *pte;

-#ifdef CONFIG_X86_PAE
-        pmd_t switcher_pmd;
-        pmd_t *pmd_table;
+        /* Code page should always be mapped, and executable. */
+        pte = find_spte(cpu, switcher_addr, false, 0, 0);
+        get_page(lg_switcher_pages[0]);
+        set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));

-        switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
-                               PAGE_KERNEL_EXEC);
+        /* Clear all the Switcher mappings for any other CPUs. */
+        /* FIXME: This is dumb: update only when Host CPU changes. */
+        for_each_possible_cpu(i) {
+                /* Get location of lguest_pages (indexed by Host CPU) */
+                base = switcher_addr + PAGE_SIZE
+                        + i * sizeof(struct lguest_pages);

-        /* Figure out where the pmd page is, by reading the PGD, and converting
-         * it to a virtual address. */
-        pmd_table = __va(pgd_pfn(cpu->lg->
-                        pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
-                                                        << PAGE_SHIFT);
-        /* Now write it into the shadow page table. */
-        set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
-        pgd_t switcher_pgd;
+                /* Get shadow PTE for first page (where we put guest regs). */
+                pte = find_spte(cpu, base, false, 0, 0);
+                set_pte(pte, __pte(0));
+
+                /* This is where we put R/O state. */
+                pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+                set_pte(pte, __pte(0));
+        }

         /*
-         * Make the last PGD entry for this Guest point to the Switcher's PTE
-         * page for this CPU (with appropriate flags).
+         * When we're running the Guest, we want the Guest's "regs" page to
+         * appear where the first Switcher page for this CPU is. This is an
+         * optimization: when the Switcher saves the Guest registers, it saves
+         * them into the first page of this CPU's "struct lguest_pages": if we
+         * make sure the Guest's register page is already mapped there, we
+         * don't have to copy them out again.
          */
-        switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
-
-        cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
-#endif
+        /* Find the shadow PTE for this regs page. */
+        base = switcher_addr + PAGE_SIZE
+                + raw_smp_processor_id() * sizeof(struct lguest_pages);
+        pte = find_spte(cpu, base, false, 0, 0);
+        regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+        get_page(regs_page);
+        set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
+
         /*
-         * We also change the Switcher PTE page.  When we're running the Guest,
-         * we want the Guest's "regs" page to appear where the first Switcher
-         * page for this CPU is. This is an optimization: when the Switcher
-         * saves the Guest registers, it saves them into the first page of this
-         * CPU's "struct lguest_pages": if we make sure the Guest's register
-         * page is already mapped there, we don't have to copy them out
-         * again.
+         * We map the second page of the struct lguest_pages read-only in
+         * the Guest: the IDT, GDT and other things it's not supposed to
+         * change.
          */
-        regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
-        set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
+        base += PAGE_SIZE;
+        pte = find_spte(cpu, base, false, 0, 0);
+
+        percpu_switcher_page
+                = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+        get_page(percpu_switcher_page);
+        set_pte(pte, mk_pte(percpu_switcher_page,
+                            __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
 }
 /*:*/

-static void free_switcher_pte_pages(void)
-{
-        unsigned int i;
-
-        for_each_possible_cpu(i)
-                free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher and per-cpu pages.
- */
-static __init void populate_switcher_pte_page(unsigned int cpu,
-                                              struct page *switcher_pages[])
-{
-        pte_t *pte = switcher_pte_page(cpu);
-        int i;
-
-        /* The first entries maps the Switcher code. */
-        set_pte(&pte[0], mk_pte(switcher_pages[0],
-                                __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-
-        /* The only other thing we map is this CPU's pair of pages. */
-        i = 1 + cpu*2;
-
-        /* First page (Guest registers) is writable from the Guest */
-        set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]),
-                         __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
-
-        /*
-         * The second page contains the "struct lguest_ro_state", and is
-         * read-only.
-         */
-        set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]),
-                           __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-}
-
 /*
  * We've made it through the page table code. Perhaps our tired brains are
  * still processing the details, or perhaps we're simply glad it's over.
@@ -1163,29 +1141,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  *
  * There is just one file remaining in the Host.
  */
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_pages)
-{
-        unsigned int i;
-
-        for_each_possible_cpu(i) {
-                switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
-                if (!switcher_pte_page(i)) {
-                        free_switcher_pte_pages();
-                        return -ENOMEM;
-                }
-                populate_switcher_pte_page(i, switcher_pages);
-        }
-        return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
-        free_switcher_pte_pages();
-}
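
One recurring detail in the new map_switcher_in_guest() above is the address arithmetic that locates a Host CPU's pair of Switcher pages. A small helper expressing it (hypothetical, not in the patch; it only restates the layout the patch assumes: one Switcher code page at switcher_addr, then one struct lguest_pages per Host CPU) would look like:

/*
 * Hypothetical helper, not part of the patch: Guest-virtual base address of
 * the "struct lguest_pages" (regs page + read-only state page) belonging to
 * Host CPU cpu_id, under the layout this patch assumes.
 */
static unsigned long switcher_pages_base(unsigned int cpu_id)
{
        return switcher_addr + PAGE_SIZE
               + cpu_id * sizeof(struct lguest_pages);
}

With such a helper, the computations of "base" in map_switcher_in_guest() would read switcher_pages_base(i) inside the loop and switcher_pages_base(raw_smp_processor_id()) for the current CPU.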