From e0377e25206328998d036cafddcd00a7c3252e3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Sun, 26 Jun 2011 19:36:46 +0300 Subject: [PATCH 1/9] lguest: Do not exit on non-fatal errors Do not exit on some non-fatal errors: - writev() fails in net_output(). The result is a lost packet or packets. - writev() fails in console_output(). The result is partially lost console output. - readv() fails in net_input(). The result is a lost packet or packets. Rather than bringing the guest down, this patch ignores e.g. an allocation failure on the host side. Example: lguest: page allocation failure. order:4, mode:0x4d0 Pid: 4045, comm: lguest Tainted: G W 2.6.36 #1 Call Trace: [] ? printk+0x18/0x1c [] __alloc_pages_nodemask+0x4d2/0x570 [] cache_alloc_refill+0x2a4/0x4d0 [] ? __netif_receive_skb+0x189/0x270 [] __kmalloc+0xda/0xf0 [] __alloc_skb+0x55/0x100 [] ? net_rx_action+0x79/0x100 [] sock_alloc_send_pskb+0x18d/0x280 [] ? _copy_from_user+0x35/0x130 [] ? memcpy_fromiovecend+0x56/0x80 [] tun_chr_aio_write+0x1cc/0x500 [] do_sync_readv_writev+0x95/0xd0 [] ? _copy_from_user+0x35/0x130 [] ? rw_copy_check_uvector+0x58/0x100 [] do_readv_writev+0x9c/0x1d0 [] ? tun_chr_aio_write+0x0/0x500 [] vfs_writev+0x4a/0x60 [] sys_writev+0x41/0x80 [] syscall_call+0x7/0xb Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 Normal per-cpu: CPU 0: hi: 186, btch: 31 usd: 0 HighMem per-cpu: CPU 0: hi: 186, btch: 31 usd: 0 active_anon:134651 inactive_anon:50543 isolated_anon:0 active_file:96881 inactive_file:132007 isolated_file:0 unevictable:0 dirty:3 writeback:0 unstable:0 free:91374 slab_reclaimable:6300 slab_unreclaimable:2802 mapped:2281 shmem:9 pagetables:330 bounce:0 DMA free:3524kB min:64kB low:80kB high:96kB active_anon:0kB inactive_anon:8kB active_file:8760kB inactive_file:2760kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15868kB mlocked:0kB dirty:0kB writeback:0kB mapped:16kB shmem:0kB slab_reclaimable:88kB slab_unreclaimable:148kB kernel_stack:40kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 865 2016 2016 Normal free:150100kB min:3728kB low:4660kB high:5592kB active_anon:6224kB inactive_anon:15772kB active_file:324084kB inactive_file:325944kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:885944kB mlocked:0kB dirty:12kB writeback:0kB mapped:1520kB shmem:0kB slab_reclaimable:25112kB slab_unreclaimable:11060kB kernel_stack:1888kB pagetables:1320kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 9207 9207 HighMem free:211872kB min:512kB low:1752kB high:2992kB active_anon:532380kB inactive_anon:186392kB active_file:54680kB inactive_file:199324kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:1178504kB mlocked:0kB dirty:0kB writeback:0kB mapped:7588kB shmem:36kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 3*4kB 65*8kB 35*16kB 18*32kB 11*64kB 9*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3524kB Normal: 35981*4kB 344*8kB 158*16kB 28*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 150100kB HighMem: 5732*4kB 5462*8kB 2826*16kB 1598*32kB 84*64kB 10*128kB 7*256kB 1*512kB 1*1024kB 1*2048kB 9*4096kB = 211872kB 231237 total pagecache pages 2340 pages in swap cache Swap cache stats: add 160060, delete 157720, find 189017/194106 Free swap = 4179840kB Total swap = 4194300kB 524271 pages RAM 296946 pages HighMem 5668 pages reserved 867664 pages shared 82155 pages non-shared Signed-off-by: Sakari Ailus Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index cd9d6af61d07..e3b9bb7a644a 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -861,8 +861,10 @@ static void console_output(struct virtqueue *vq) /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); + if (len <= 0) { + warn("Write to stdout gave %i (%d)", len, errno); + break; + } iov_consume(iov, out, len); } @@ -898,7 +900,7 @@ static void net_output(struct virtqueue *vq) * same format: what a coincidence! */ if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); + warnx("Write to tun failed (%d)?", errno); /* * Done with that one; wait_for_vq_desc() will send the interrupt if @@ -955,7 +957,7 @@ static void net_input(struct virtqueue *vq) */ len = readv(net_info->tunfd, iov, in); if (len <= 0) - err(1, "Failed to read from tun."); + warn("Failed to read from tun (%d).", errno); /* * Mark that packet buffer as used, but don't interrupt here. We want From 5dea1c88ed11a1221581c4b202f053c4fc138704 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:48 +0930 Subject: [PATCH 2/9] lguest: use a special 1:1 linear pagetable mode until first switch. The Host used to create some page tables for the Guest to use at the top of Guest memory; it would then tell the Guest where this was. In particular, it created linear mappings for 0 and 0xC0000000 addresses because lguest used to switch to its real page tables quite late in boot. However, since d50d8fe19 Linux initialized boot page tables in head_32.S even before the "are we lguest?" boot jump. So, now we can simplify things: the Host pagetable code assumes 1:1 linear mapping until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do before we reach C code. This also means that the Host doesn't need to know anything about the Guest's PAGE_OFFSET. (Non-Linux guests might not even have such a thing). Signed-off-by: Rusty Russell --- arch/x86/kernel/asm-offsets_32.c | 1 - arch/x86/lguest/boot.c | 11 +- arch/x86/lguest/i386_head.S | 9 +- drivers/lguest/lg.h | 2 + drivers/lguest/page_tables.c | 278 ++++++++++--------------------- include/linux/lguest.h | 2 - 6 files changed, 98 insertions(+), 205 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index c29d631af6fc..395a10e68067 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -63,7 +63,6 @@ void foo(void) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index db832fd65ecb..719a32c60516 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; +static unsigned long current_cr3; /* * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. The only - * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. + * cr0. Keep a local copy, and tell the Host when it changes. */ static void lguest_write_cr3(unsigned long cr3) { - lguest_data.pgdir = cr3; lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); + current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) @@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3) static unsigned long lguest_read_cr3(void) { - return lguest_data.pgdir; + return current_cr3; } /* cr4 is used to enable and disable PGE, but we don't care. */ @@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp) static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); + lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d55..863b03a9fbff 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -27,13 +27,18 @@ .section .init.text, "ax", @progbits ENTRY(lguest_entry) /* - * We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. + * We make the "initialization" hypercall now to tell the Host where + * our lguest_data struct is. */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx int $LGUEST_TRAP_ENTRY + /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ + movl $LHCALL_NEW_PGTABLE, %eax + movl $(initial_page_table - __PAGE_OFFSET), %ebx + int $LGUEST_TRAP_ENTRY + /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9136411fadd5..295df06e6590 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -59,6 +59,8 @@ struct lg_cpu { struct lguest_pages *last_pages; + /* Initialization mode: linear map everything. */ + bool linear_pages; int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d21578ee95de..00026222bde8 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "lg.h" /*M:008 @@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) #endif /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpgd = __pgd(CHECK_GPGD_MASK); + } else { + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + return false; + } /* Now look at the matching shadow entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); @@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) } #ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpmd = __pmd(_PAGE_TABLE); + } else { + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* Middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + } /* Now look at the matching shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); @@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif - /* Read the actual PTE value. */ - gpte = lgread(cpu, gpte_ptr, pte_t); + if (unlikely(cpu->linear_pages)) { + /* Linear? Make up a PTE which points to same page. */ + gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); + } else { + /* Read the actual PTE value. */ + gpte = lgread(cpu, gpte_ptr, pte_t); + } /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(cpu, gpte_ptr, pte_t, gpte); + if (likely(!cpu->linear_pages)) + lgwrite(cpu, gpte_ptr, pte_t, gpte); /* * The fault is fixed, the page table is populated, the mapping @@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #ifdef CONFIG_X86_PAE pmd_t gpmd; #endif + + /* Still not set up? Just map 1:1. */ + if (unlikely(cpu->linear_pages)) + return vaddr; + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 - * (iv) Switching page tables - * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. - */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) -{ - int newpgdir, repin = 0; - - /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); - /* - * If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. - */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); - /* Change the current pgd index to the new one. */ - cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ - if (repin) - pin_stack_pages(cpu); -} - /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); } + +/*H:430 + * (iv) Switching page tables + * + * Now we've seen all the page table setting and manipulation, let's see + * what happens when the Guest changes page tables (ie. changes the top-level + * pgdir). This occurs on almost every context switch. + */ +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +{ + int newpgdir, repin = 0; + + /* + * The very first time they call this, we're actually running without + * any page tables; we've been making it up. Throw them away now. + */ + if (unlikely(cpu->linear_pages)) { + release_all_pagetables(cpu->lg); + cpu->linear_pages = false; + /* Force allocation of a new pgdir. */ + newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); + } else { + /* Look to see if we have this one already. */ + newpgdir = find_pgdir(cpu->lg, pgtable); + } + + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ + if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) + newpgdir = new_pgdir(cpu, pgtable, &repin); + /* Change the current pgd index to the new one. */ + cpu->cpu_pgd = newpgdir; + /* If it was completely blank, we map in the Guest kernel stack */ + if (repin) + pin_stack_pages(cpu); +} /*:*/ /*M:009 @@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/*H:505 - * To get through boot, we construct simple identity page mappings (which - * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. The linear mapping means we - * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, - * as you'll see. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). - */ -static unsigned long setup_pagetables(struct lguest *lg, - unsigned long mem, - unsigned long initrd_size) -{ - pgd_t __user *pgdir; - pte_t __user *linear; - unsigned long mem_base = (unsigned long)lg->mem_base; - unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE - pmd_t __user *pmds; - unsigned int j; - pgd_t pgd; - pmd_t pmd; -#else - unsigned int phys_linear; -#endif - - /* - * We have mapped_pages frames to map, so we need linear_pages page - * tables to map them. - */ - mapped_pages = mem / PAGE_SIZE; - linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; - - /* We put the toplevel page directory page at the top of memory. */ - pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); - - /* Now we use the next linear_pages pages as pte pages */ - linear = (void *)pgdir - linear_pages * PAGE_SIZE; - -#ifdef CONFIG_X86_PAE - /* - * And the single mid page goes below that. We only use one, but - * that's enough to map 1G, which definitely gets us through boot. - */ - pmds = (void *)linear - PAGE_SIZE; -#endif - /* - * Linear mapping is easy: put every page's address into the - * mapping in order. - */ - for (i = 0; i < mapped_pages; i++) { - pte_t pte; - pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); - if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) - return -EFAULT; - } - -#ifdef CONFIG_X86_PAE - /* - * Make the Guest PMD entries point to the corresponding place in the - * linear mapping (up to one page worth of PMD). - */ - for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; - i += PTRS_PER_PTE, j++) { - pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - - if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) - return -EFAULT; - } - - /* One PGD entry, pointing to that PMD page. */ - pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); - /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ - if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) - return -EFAULT; - /* - * And the other PGD entry to make the linear mapping at PAGE_OFFSET - */ - if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) - return -EFAULT; -#else - /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. - */ - phys_linear = (unsigned long)linear - mem_base; - for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { - pgd_t pgd; - /* - * Create a PGD entry which points to the right part of the - * linear PTE pages. - */ - pgd = __pgd((phys_linear + i * sizeof(pte_t)) | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - - /* - * Copy it into the PGD page at 0 and PAGE_OFFSET. - */ - if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) - || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) - + i / PTRS_PER_PTE], - &pgd, sizeof(pgd))) - return -EFAULT; - } -#endif - - /* - * We return the top level (guest-physical) address: we remember where - * this is to write it into lguest_data when the Guest initializes. - */ - return (unsigned long)pgdir - mem_base; -} - /*H:500 * (vii) Setting up the page tables initially. * - * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: + * When a Guest is first created, set initialize a shadow page table which + * we will populate on future faults. The Guest doesn't have any actual + * pagetables yet, so we set linear_pages to tell demand_page() to fake it + * for the moment. */ int init_guest_pagetable(struct lguest *lg) { - u64 mem; - u32 initrd_size; - struct boot_params __user *boot = (struct boot_params *)lg->mem_base; -#ifdef CONFIG_X86_PAE - pgd_t *pgd; - pmd_t *pmd_table; -#endif - /* - * Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). - */ - if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) - || get_user(initrd_size, &boot->hdr.ramdisk_size)) - return -EFAULT; + struct lg_cpu *cpu = &lg->cpus[0]; + int allocated = 0; - /* - * We start on the first shadow page table, and give it a blank PGD - * page. - */ - lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); - if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) - return lg->pgdirs[0].gpgdir; - lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); - if (!lg->pgdirs[0].pgdir) + /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ + cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); + if (!allocated) return -ENOMEM; -#ifdef CONFIG_X86_PAE - /* For PAE, we also create the initial mid-level. */ - pgd = lg->pgdirs[0].pgdir; - pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); - if (!pmd_table) - return -ENOMEM; - - set_pgd(pgd + SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); -#endif - - /* This is the current page table. */ - lg->cpus[0].cpu_pgd = 0; + /* We start with a linear mapping until the initialize. */ + cpu->linear_pages = true; return 0; } @@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu) * of virtual addresses used by the Switcher. */ || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, - &cpu->lg->lguest_data->pgdir)) + &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + return; + } /* * In flush_user_mappings() we loop from 0 to diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 2fb1dcbcb5aa..9962c6bb1311 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -59,8 +59,6 @@ struct lguest_data { unsigned long reserve_mem; /* KHz for the TSC clock. */ u32 tsc_khz; - /* Page where the top-level pagetable is */ - unsigned long pgdir; /* Fields initialized by the Guest at boot: */ /* Instruction range to suppress interrupts even if enabled */ From 7e1941444f808d8001aa3b63588150c516321a3c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:49 +0930 Subject: [PATCH 3/9] lguest: remove remaining vmcall We switch back from using vmcall in 091ebf07a2408f9a56634caa0f86d9360e9af23b because it was unreliable under kvm, but I missed one (rarely-used) place. Signed-off-by: Rusty Russell --- arch/x86/lguest/i386_head.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 863b03a9fbff..c8c95e575c1e 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -101,12 +101,8 @@ send_interrupts: */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* - * This is a vmcall instruction (same thing that KVM uses). Older - * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. - */ - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* This is the actual hypercall trap. */ + int $LGUEST_TRAP_ENTRY /* Put eax back the way we found it. */ popl %eax ret From 6d7a5d1ea34495ecb1d608f0e40afba7776ee408 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:49 +0930 Subject: [PATCH 4/9] lguest: don't rewrite vmcall instructions Now we no longer use vmcall, we don't need to rewrite it in the Guest. Signed-off-by: Rusty Russell --- drivers/lguest/interrupts_and_traps.c | 6 +-- drivers/lguest/x86/core.c | 77 --------------------------- 2 files changed, 2 insertions(+), 81 deletions(-) diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index daaf86631647..f0c171506371 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num) /* * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and - * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. + * device not available (TS handling) and of course, the hypercall trap. */ - return num != 14 && num != 13 && num != 7 && - num != 6 && num != LGUEST_TRAP_ENTRY; + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; } /*:*/ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 9f1659c3d1f3..ec0cdfc04e78 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -352,69 +352,6 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* - * Our hypercalls mechanism used to be based on direct software interrupts. - * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to - * change over to using kvm hypercalls. - * - * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid - * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be - * an *emulation approach*: if the fault was really produced by an hypercall - * (is_hypercall() does exactly this check), we can just call the corresponding - * hypercall host implementation function. - * - * But these invalid opcode faults are notably slower than software interrupts. - * So we implemented the *patching (or rewriting) approach*: every time we hit - * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f" - * opcode, so next time the Guest calls this hypercall it will use the - * faster trap mechanism. - * - * Matias even benchmarked it to convince you: this shows the average cycle - * cost of a hypercall. For each alternative solution mentioned above we've - * made 5 runs of the benchmark: - * - * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 - * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 - * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 - * - * One two-line function is worth a 20% hypercall speed boost! - */ -static void rewrite_hypercall(struct lg_cpu *cpu) -{ - /* - * This are the opcodes we use to patch the Guest. The opcode for "int - * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). - */ - u8 insn[3] = {0xcd, 0x1f, 0x90}; - - __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* - * The above write might have caused a copy of that page to be made - * (if it was read-only). We need to make sure the Guest has - * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. - */ - guest_pagetable_clear_all(cpu); -} - -static bool is_hypercall(struct lg_cpu *cpu) -{ - u8 insn[3]; - - /* - * This must be the Guest kernel trying to do something. - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((cpu->regs->cs & 3) != GUEST_PL) - return false; - - /* Is it a vmcall? */ - __lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); - return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; -} - /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ void lguest_arch_handle_trap(struct lg_cpu *cpu) { @@ -429,20 +366,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) if (emulate_insn(cpu)) return; } - /* - * If KVM is active, the vmcall instruction triggers a General - * Protection Fault. Normally it triggers an invalid opcode - * fault (6): - */ - case 6: - /* - * We need to check if ring == GUEST_PL and faulting - * instruction == vmcall. - */ - if (is_hypercall(cpu)) { - rewrite_hypercall(cpu); - return; - } break; case 14: /* We've intercepted a Page Fault. */ /* From 3c3ed482dc077a67903a58c9e1aedba1bb18c18a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:49 +0930 Subject: [PATCH 5/9] lguest: Simplify device initialization. We used to notify the Host every time we updated a device's status. However, it only really needs to know when we're resetting the device, or failed to initialize it, or when we've finished our feature negotiation. In particular, we used to wait for VIRTIO_CONFIG_S_DRIVER_OK in the status byte before starting the device service threads. But this corresponds to the successful finish of device initialization, which might (like virtio_blk's partition scanning) use the device. So we had a hack, if they used the device before we expected we started the threads anyway. Now we hook into the finalize_features hook in the Guest: at that point we tell the Launcher that it can rely on the features we have acked. On the Launcher side, we look at the status at that point, and start servicing the device. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 25 +++++------------- drivers/lguest/lguest_device.c | 37 ++++++++++++++++----------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index e3b9bb7a644a..80261d34da3d 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -1095,9 +1095,10 @@ static void update_device_status(struct device *dev) warnx("Device %s configuration FAILED", dev->name); if (dev->running) reset_device(dev); - } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); + } else { + if (dev->running) + err(1, "Device %s features finalized twice", dev->name); + start_device(dev); } } @@ -1122,25 +1123,11 @@ static void handle_output(unsigned long addr) return; } - /* - * Devices *can* be used before status is set to DRIVER_OK. - * The original plan was that they would never do this: they - * would always finish setting up their status bits before - * actually touching the virtqueues. In practice, we allowed - * them to, and they do (eg. the disk probes for partition - * tables as part of initialization). - * - * If we see this, we start the device: once it's running, we - * expect the device to catch all the notifications. - */ + /* Devices should not be used before features are finalized. */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - /* This just calls create_thread() for each virtqueue */ - start_device(i); - return; + errx(1, "Notification on %s before setup!", i->name); } } diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88ea..5289ffa2e500 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -108,6 +108,17 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } +/* + * To notify on reset or feature finalization, we (ab)use the NOTIFY + * hypercall, with the descriptor address of the device. + */ +static void status_notify(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); +} + /* * The virtio core takes the features the Host offers, and copies the ones * supported by the driver into the vdev->features array. Once that's all @@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev) if (test_bit(i, vdev->features)) out_features[i / 8] |= (1 << (i % 8)); } + + /* Tell Host we've finished with this device's feature negotiation */ + status_notify(vdev); } /* Once they've found a field, getting a copy of it is easy. */ @@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev) return to_lgdev(vdev)->desc->status; } -/* - * To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". - */ -static void set_status(struct virtio_device *vdev, u8 status) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - /* We set the status. */ - to_lgdev(vdev)->desc->status = status; - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - static void lg_set_status(struct virtio_device *vdev, u8 status) { BUG_ON(!status); - set_status(vdev, status); + to_lgdev(vdev)->desc->status = status; + + /* Tell Host immediately if we failed. */ + if (status & VIRTIO_CONFIG_S_FAILED) + status_notify(vdev); } static void lg_reset(struct virtio_device *vdev) { - set_status(vdev, 0); + /* 0 status means "reset" */ + to_lgdev(vdev)->desc->status = 0; + status_notify(vdev); } /* From 9f54288def3f92b7805eb6d4b1ddcd73ecf6e889 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:50 +0930 Subject: [PATCH 6/9] lguest: update comments Also removes a long-unused #define and an extraneous semicolon. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 12 ++++-------- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 18 +++++++++++------- drivers/lguest/core.c | 2 +- drivers/lguest/interrupts_and_traps.c | 4 ++-- drivers/lguest/lguest_user.c | 17 ++++++++++------- drivers/lguest/page_tables.c | 4 ++-- drivers/lguest/x86/core.c | 10 ++++------ 9 files changed, 50 insertions(+), 39 deletions(-) diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index 80261d34da3d..043bd7df3139 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -51,7 +51,7 @@ #include #include "../../../include/linux/lguest_launcher.h" /*L:110 - * We can ignore the 42 include files we need for this program, but I do want + * We can ignore the 43 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -65,7 +65,6 @@ typedef uint16_t u16; typedef uint8_t u8; /*:*/ -#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -1359,7 +1358,7 @@ static void setup_console(void) * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * - * More sopisticated is to use one of the tools developed for project like UML + * More sophisticated is to use one of the tools developed for project like UML * to do networking. * * Faster is to do virtio bonding in kernel. Doing this 1:1 would be @@ -1369,7 +1368,7 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. + * Finally, we could use a virtio network switch in the kernel, ie. vhost. :*/ static u32 str2ip(const char *ipaddr) @@ -2006,10 +2005,7 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* - * We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. - */ + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); /* Ensure that we terminate if a device-servicing child dies. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f2924c413..879fd7d33877 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -61,6 +61,7 @@ hcall(unsigned long call, : "memory"); return call; } +/*:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 719a32c60516..74279907bc1a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -71,7 +71,8 @@ #include #include /* for struct machine_ops */ -/*G:010 Welcome to the Guest! +/*G:010 + * Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the @@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, #endif /*G:036 - * When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. -:*/ + * When lazy mode is turned off, we issue the do-nothing hypercall to + * flush any stored calls, and call the generic helper to reset the + * per-cpu lazy mode variable. + */ static void lguest_leave_lazy_mmu_mode(void) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } +/* + * We also catch the end of context switch; we enter lazy mode for much of + * that too, so again we need to flush here. + * + * (Technically, this is lazy CPU mode, and normally we're in lazy MMU + * mode, but unlike Xen, lguest doesn't care about the difference). + */ static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); @@ -640,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) /* * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. Wetell the Host the toplevel and + * a page into a process' address space. We tell the Host the toplevel and * address this corresponds to. The Guest uses one pagetable per process, so * we need to tell the Host which one we're changing (mm->pgd). */ @@ -1139,7 +1148,7 @@ static struct notifier_block paniced = { static __init char *lguest_memory_setup(void) { /* - *The Linux bootloader header contains an "e820" memory map: the + * The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ e820_add_region(boot_params.e820_map[0].addr, diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index c8c95e575c1e..cfa23e37ec5c 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -6,18 +6,22 @@ #include /*G:020 - * Our story starts with the kernel booting into startup_32 in - * arch/x86/kernel/head_32.S. It expects a boot header, which is created by - * the bootloader (the Launcher in our case). + + * Our story starts with the bzImage: booting starts at startup_32 in + * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real + * kernel in place and then jumps into it: startup_32 in + * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi + * register, which is created by the bootloader (the Launcher in our case). * * The startup_32 function does very little: it clears the uninitialized global * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe. Finally it checks the - * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: - * if it's set to '1' (lguest's assigned number), then it calls us here. + * header and kernel command line somewhere safe, and populates some initial + * page tables. Finally it checks the 'hardware_subarch' field. This was + * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's + * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expectes + * addesses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e37..2535933c49f8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -117,7 +117,7 @@ static __init int map_switcher(void) /* * Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). + * Copy in the compiled-in Switcher code (from x86/switcher_32.S). */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index f0c171506371..28433a155d67 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -427,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu) /* * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the IDT entries to use that - * stack. The IDT entries expect a virtual address, so unlike most addresses + * a different kernel stack, so we can change the guest TSS to use that + * stack. The TSS entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9e..f97e625241ad 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher - * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout and entry point. A read will run the - * Guest until something happens, such as a signal or the Guest doing a NOTIFY - * out to the Launcher. +/*P:200 This contains all the /dev/lguest code, whereby the userspace + * launcher controls and communicates with the Guest. For example, + * the first write will tell us the Guest's memory layout and entry + * point. A read will run the Guest until something happens, such as + * a signal or the Guest doing a NOTIFY out to the Launcher. There is + * also a way for the Launcher to attach eventfds to particular NOTIFY + * values instead of returning from the read() call. :*/ #include #include @@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input) goto free_eventfds; /* - * Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. + * Initialize the Guest's shadow page tables. This allocates + * memory, so can fail. */ err = init_guest_pagetable(lg); if (err) @@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = { .read = read, .llseek = default_llseek, }; +/*:*/ /* * This is a textbook example of a "misc" character device. Populate a "struct diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 00026222bde8..3b62be160a6e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -195,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 +/*M:007 * get_pfn is slow: we could probably try to grab batches of pages here as * an optimization (ie. pre-faulting). :*/ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index ec0cdfc04e78..3b9b810cbf28 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -272,7 +272,7 @@ static int emulate_insn(struct lg_cpu *cpu) unsigned int insnlen = 0, in = 0, shift = 0; /* * The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. + * walk the Guest's page tables to find the "physical" address. */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); @@ -409,7 +409,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again + * return to run the Guest again. */ cond_resched(); return; @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) int i; /* - * Most of the i386/switcher.S doesn't care that it's been moved; on + * Most of the x86/switcher_32.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -587,7 +587,7 @@ void __init lguest_arch_host_init(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); } put_online_cpus(); -}; +} /*:*/ void __exit lguest_arch_host_fini(void) @@ -670,8 +670,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) /*:*/ /*L:030 - * lguest_arch_setup_regs() - * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ From 64be11583bac5ca65d017a5c4288f10b2bcf1928 Mon Sep 17 00:00:00 2001 From: Adrian Knoth Date: Mon, 11 Jul 2011 18:07:14 +0200 Subject: [PATCH 7/9] lguest: Fix three simple typos in comments This patch fixes three typos I've accidentally spotted. Signed-off-by: Adrian Knoth Signed-off-by: Rusty Russell (one was already fixed) --- arch/x86/lguest/boot.c | 2 +- arch/x86/lguest/i386_head.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 74279907bc1a..5c4f240289b4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -467,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, /* * PAE systems can mark pages as non-executable. Linux calls this the * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch turn if off here, since we don't + * Virus Protection). We just switch it off here, since we don't * support it. */ case 0x80000001: diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index cfa23e37ec5c..6ddfe4fc23c3 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -21,7 +21,7 @@ * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expects + * addresses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * From 8d431f41603acff8a20cf5df99bc8958c91879c1 Mon Sep 17 00:00:00 2001 From: Adrian Knoth Date: Mon, 11 Jul 2011 18:08:47 +0200 Subject: [PATCH 8/9] lguest: Fix translation count about wikipedia's cpuid page The comment is outdated, wikipedia now has six translations of the cpuid page. Signed-off-by: Adrian Knoth Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5c4f240289b4..13ee258442ae 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -400,7 +400,7 @@ static void lguest_load_tr_desc(void) * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 5 languages. I am not making this up! + * has been translated into 6 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned From 996ba96a97f7406052486682846d68935a60e986 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:51 +0930 Subject: [PATCH 9/9] lguest: Fix in/out emulation We were blatting too much of the register. Linux didn't care, but in theory it might. Reported-by: Jonas Maebe Signed-off-by: Rusty Russell --- drivers/lguest/x86/core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 3b9b810cbf28..65af42f2d593 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -269,7 +269,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) static int emulate_insn(struct lg_cpu *cpu) { u8 insn; - unsigned int insnlen = 0, in = 0, shift = 0; + unsigned int insnlen = 0, in = 0, small_operand = 0; /* * The eip contains the *virtual* address of the Guest's instruction: * walk the Guest's page tables to find the "physical" address. @@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu) } /* - * 0x66 is an "operand prefix". It means it's using the upper 16 bits - * of the eax register. + * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. */ if (insn == 0x66) { - shift = 16; + small_operand = 1; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; insn = lgread(cpu, physaddr + insnlen, u8); @@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu) * traditionally means "there's nothing there". */ if (in) { - /* Lower bit tells is whether it's a 16 or 32 bit access */ - if (insn & 0x1) - cpu->regs->eax = 0xFFFFFFFF; - else - cpu->regs->eax |= (0xFFFF << shift); + /* Lower bit tells means it's a 32/16 bit access */ + if (insn & 0x1) { + if (small_operand) + cpu->regs->eax |= 0xFFFF; + else + cpu->regs->eax = 0xFFFFFFFF; + } else + cpu->regs->eax |= 0xFF; } /* Finally, we've "done" the instruction, so move past it. */ cpu->regs->eip += insnlen;