diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a494fa34713a..7bb15747fea2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1921,54 +1921,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. - If unsure, say N. + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..6fce7f096b88 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,8 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index b1ef9e489084..700a9c6e6159 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ @@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb23064f..cfdd8c3f8af2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h + +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +$(obj)/misc.o: $(obj)/../voffset.h + vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone @@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c deleted file mode 100644 index 6a9b96b4624d..000000000000 --- a/arch/x86/boot/compressed/aslr.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "misc.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - -static unsigned long rotate_xor(unsigned long hash, const void *area, - size_t size) -{ - size_t i; - unsigned long *ptr = (unsigned long *)area; - - for (i = 0; i < size / sizeof(hash); i++) { - /* Rotate by odd number of bits and XOR. */ - hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); - hash ^= ptr[i]; - } - - return hash; -} - -/* Attempt to create a simple but unpredictable starting entropy. */ -static unsigned long get_random_boot(void) -{ - unsigned long hash = 0; - - hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); - - return hash; -} - -static unsigned long get_random_long(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr("KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} - -struct mem_vector { - unsigned long start; - unsigned long size; -}; - -#define MEM_AVOID_MAX 5 -static struct mem_vector mem_avoid[MEM_AVOID_MAX]; - -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - -static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) -{ - /* Item one is entirely before item two. */ - if (one->start + one->size <= two->start) - return false; - /* Item one is entirely after item two. */ - if (one->start >= two->start + two->size) - return false; - return true; -} - -static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) -{ - u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; - char *ptr; - - /* - * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). - */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; - - /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; - - /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; - /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) - ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; - - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; -} - -/* Does this memory vector overlap a known avoided area? */ -static bool mem_avoid_overlap(struct mem_vector *img) -{ - int i; - struct setup_data *ptr; - - for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; - } - - /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; - while (ptr) { - struct mem_vector avoid; - - avoid.start = (unsigned long)ptr; - avoid.size = sizeof(*ptr) + ptr->len; - - if (mem_overlaps(img, &avoid)) - return true; - - ptr = (struct setup_data *)(unsigned long)ptr->next; - } - - return false; -} - -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; -static unsigned long slot_max; - -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - -static unsigned long slots_fetch_random(void) -{ - /* Handle case of no slots stored. */ - if (slot_max == 0) - return 0; - - return slots[get_random_long() % slot_max]; -} - -static void process_e820_entry(struct e820entry *entry, - unsigned long minimum, - unsigned long image_size) -{ - struct mem_vector region, img; - - /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) - return; - - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) - return; - - region.start = entry->addr; - region.size = entry->size; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; - - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; - - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; - - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); - } -} - -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) -{ - int i; - unsigned long addr; - - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); - } - - return slots_fetch_random(); -} - -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_size) -{ - unsigned long choice = (unsigned long)output; - unsigned long random; - -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); - goto out; - } -#else - if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); - goto out; - } -#endif - - boot_params->hdr.loadflags |= KASLR_FLAG; - - /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); - - /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { - debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; - } - - /* Always enforce the minimum. */ - if (random < choice) - goto out; - - choice = random; -out: - return (unsigned char *)choice; -} diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e3033e6b9..73ccf63b0f48 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 000000000000..6248740b68b5 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,22 @@ +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" + +void warn(char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 000000000000..2e59dac07f9e --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,7 @@ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +void warn(char *m); +void error(char *m); + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da..1038524270e7 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -176,7 +176,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -233,24 +235,28 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ + /* push arguments for extract_kernel: */ pushl $z_output_len /* decompressed length, end of relocs */ - leal z_extract_offset_negative(%ebx), %ebp + + movl BP_init_size(%esi), %eax + subl $_end, %eax + movl %ebx, %ebp + subl %eax, %ebp pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ - addl $28, %esp + call extract_kernel /* returns kernel location in %eax */ + addl $24, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a199139..0d80a7ad65cd 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -110,7 +110,9 @@ ENTRY(startup_32) 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* * Prepare for entering 64 bit mode @@ -132,7 +134,7 @@ ENTRY(startup_32) /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ @@ -338,7 +340,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $_end, %ebx + addq %rbp, %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -408,19 +412,16 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ - movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ - pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ - popq %r9 + call extract_kernel /* returns kernel location in %rax */ popq %rsi /* @@ -485,4 +486,4 @@ boot_stack_end: .section ".pgtable","a",@nobits .balign 4096 pgtable: - .fill 6*4096, 1, 0 + .fill BOOT_PGT_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 000000000000..cfeb0259ed81 --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,510 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ +#include "misc.h" +#include "error.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. */ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + + return hash; +} + +static unsigned long get_random_long(const char *purpose) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr(purpose); + debug_putstr(" KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} + +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MAX, +}; + +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +/* + * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. + * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. + * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size + * + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) + * + * The above conditions can be illustrated by a diagram: + * + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size + * + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. + * + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). + * + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) + * + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. + * + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. + * + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. + */ +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output) +{ + unsigned long init_size = boot_params->hdr.init_size; + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression. + */ + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, + mem_avoid[MEM_AVOID_ZO_RANGE].size); + + /* Avoid initrd. */ + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ + + /* Avoid kernel command line. */ + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. */ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); + + /* Avoid boot parameters. */ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, + mem_avoid[MEM_AVOID_BOOTPARAMS].size); + + /* We don't need to set a mapping for setup_data. */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* Make sure video RAM can be used. */ + add_identity_map(0, PMD_SIZE); +#endif +} + +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) +{ + int i; + struct setup_data *ptr; + unsigned long earliest = img->start + img->size; + bool is_overlapping = false; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + is_overlapping = true; + } + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + is_overlapping = true; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return is_overlapping; +} + +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + +static unsigned long slot_max; + +static unsigned long slot_area_index; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long("Physical") % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img, overlap; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= KERNEL_IMAGE_SIZE) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img, &overlap)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, + image_size); + } + + return slots_fetch_random(); +} + +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + /* Align image_size for easy slot calculations. */ + image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random_addr = get_random_long("Virtual") % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +unsigned char *choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long output, + unsigned long output_size) +{ + unsigned long choice = output; + unsigned long random_addr; + +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); + goto out; + } +#else + if (cmdline_find_option_bool("nokaslr")) { + warn("KASLR disabled: 'nokaslr' on cmdline."); + goto out; + } +#endif + + boot_params->hdr.loadflags |= KASLR_FLAG; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init(input, input_size, output); + + /* Walk e820 and find a random address. */ + random_addr = find_random_phys_addr(output, output_size); + if (!random_addr) { + warn("KASLR disabled: could not find suitable E820 region!"); + goto out; + } + + /* Always enforce the minimum. */ + if (random_addr < choice) + goto out; + + choice = random_addr; + + add_identity_map(choice, output_size); + + /* This actually loads the identity pagetable on x86_64. */ + finalize_identity_maps(); +out: + return (unsigned char *)choice; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac1758e7c..f14db4e21654 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. + * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -10,111 +12,37 @@ */ #include "misc.h" +#include "error.h" #include "../string.h" - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ +#include "../voffset.h" /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove - -static void error(char *m); +/* Functions used by the included decompressor code below. */ +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -146,12 +74,16 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -184,12 +116,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +142,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -237,23 +169,13 @@ void __puthex(unsigned long value) } } -static void error(char *x) -{ - error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); - - while (1) - asm("hlt"); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len) { int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load @@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. */ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; @@ -372,9 +294,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -383,23 +303,41 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + const unsigned long kernel_total_size = VO__end - VO__text; unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); - debug_putaddr(run_size); + debug_putaddr(kernel_total_size); /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, - output_len > run_size ? output_len - : run_size); + output = choose_random_location((unsigned long)input_data, input_len, + (unsigned long)output, + max(output_len, kernel_total_size)); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b3..b6fec1ff10e4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) @@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +/* kaslr.c */ +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size) { - return output; + return (unsigned char *)output_ptr; } #endif +#ifdef CONFIG_X86_64 +void add_identity_map(unsigned long start, unsigned long size); +void finalize_identity_maps(void); +extern unsigned char _pgtable[]; +#else +static inline void add_identity_map(unsigned long start, unsigned long size) +{ } +static inline void finalize_identity_maps(void) +{ } +#endif + #ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ extern int early_serial_base; diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index d8222f213182..72bad2c8debe 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -18,11 +18,10 @@ * * H. Peter Anvin * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. + * */ #include @@ -35,14 +34,11 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); goto bail; } @@ -67,29 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - run_size = atoi(argv[2]); - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); - printf(".globl z_run_size\n"); - printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c new file mode 100644 index 000000000000..34b95df14e69 --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.c @@ -0,0 +1,129 @@ +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include +#include +#include "../../mm/ident_map.c" + +/* Used by pgtable.h asm code to force instruction serialization. */ +unsigned long __force_order; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long level4p; + +/* Locates and clears a region for a new top level page table. */ +static void prepare_level4(void) +{ + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + */ + level4p = read_cr3(); + if (level4p == (unsigned long)_pgtable) { + debug_putstr("booted via startup_32()\n"); + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + debug_putstr("booted via startup_64()\n"); + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + level4p = (unsigned long)alloc_pgt_page(&pgt_data); + } +} + +/* + * Adds the specified range to what will become the new identity mappings. + * Once all ranges have been added, the new mapping is activated by calling + * finalize_identity_maps() below. + */ +void add_identity_map(unsigned long start, unsigned long size) +{ + struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long end = start + size; + + /* Make sure we have a top level page table ready to use. */ + if (!level4p) + prepare_level4(); + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + start, end); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(level4p); +} diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788be1db9..cea140ce6b42 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,16 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ +#include "error.h" + #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +void *memmove(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} + +/* Detect and warn about potential overlaps, but handle them with memmove. */ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return __memcpy(dest, src, n); +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c98284..e24e0a0c90c9 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -70,5 +70,6 @@ SECTIONS _epgtable = . ; } #endif + . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; } diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 45a07684bbab..f0b8d6d93164 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,7 @@ +/* + * Serial port routines for use during early boot reporting. This code is + * included from both the compressed kernel and the regular kernel. + */ #include "boot.h" #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9ec4b76..3dd5be33aaa7 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. +# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 + +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#if ZO_z_output_len > ZO_z_input_len +# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ + ZO_z_input_len) +#else +# define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * The extract_offset has to be bigger than ZO head section. Otherwise when + * the head code is running to move ZO to the end of the buffer, it will + * overwrite the head code itself. + */ +#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset +# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095) +#else +# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095) +#endif + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -#define INIT_SIZE ZO_INIT_SIZE +# define INIT_SIZE ZO_INIT_SIZE #else -#define INIT_SIZE VO_INIT_SIZE +# define INIT_SIZE VO_INIT_SIZE #endif + init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6b8d6e8cd449..abd06b19ddd2 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -12,29 +12,46 @@ /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 -#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) +# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) -#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +# error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif #ifdef CONFIG_KERNEL_BZIP2 -#define BOOT_HEAP_SIZE 0x400000 +# define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ - -#define BOOT_HEAP_SIZE 0x10000 - -#endif /* !CONFIG_KERNEL_BZIP2 */ +# define BOOT_HEAP_SIZE 0x10000 +#endif #ifdef CONFIG_X86_64 -#define BOOT_STACK_SIZE 0x4000 -#else -#define BOOT_STACK_SIZE 0x1000 +# define BOOT_STACK_SIZE 0x4000 + +# define BOOT_INIT_PGT_SIZE (6*4096) +# ifdef CONFIG_RANDOMIZE_BASE +/* + * Assuming all cross the 512GB boundary: + * 1 page for level4 + * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel + * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). + * Total is 19 pages. + */ +# ifdef CONFIG_X86_VERBOSE_BOOTUP +# define BOOT_PGT_SIZE (19*4096) +# else /* !CONFIG_X86_VERBOSE_BOOTUP */ +# define BOOT_PGT_SIZE (17*4096) +# endif +# else /* !CONFIG_RANDOMIZE_BASE */ +# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE +# endif + +#else /* !CONFIG_X86_64 */ +# define BOOT_STACK_SIZE 0x1000 #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde30c928..cf8f619b305f 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0d5af0..d5c2f8b40faa 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3c731413f1de..2970d22d7766 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,17 +15,6 @@ #include #include -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b4a23eafa1b9..7fa9e7740ba3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,15 +69,9 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9251aa962721..62c6cc3cc5d3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -480,8 +480,6 @@ static inline unsigned long current_top_of_stack(void) #include #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2721d6..4dcdf74dfed8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -141,6 +141,44 @@ struct x86_cpuinit_ops { struct timespec; +/** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus. The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + +/** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. + */ +struct x86_legacy_features { + int rtc; + int ebda_search; + struct x86_legacy_devices devices; +}; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC @@ -152,6 +190,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +211,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 329254373479..c18ce67495fa 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. + * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... + */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a..9abf8551c7e4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += ebda.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2522e564269e..f115a58f7c84 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f182fe30..c7364bd633e1 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 +2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c042466f274..674134e9f5e5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -80,6 +80,7 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); OFFSET(BP_code32_start, boot_params, hdr.code32_start); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b18f4706e607..8dae51fd3db1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -233,7 +233,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c similarity index 98% rename from arch/x86/kernel/head.c rename to arch/x86/kernel/ebda.c index 992f442ca155..afe65dffee80 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/ebda.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3a9f1c..d784bb547a9d 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d5c8d0..b72fb0b71dd1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dc1207e2f193..eea2a6f72b31 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f9583917c7c4..7b3b3f24c3ea 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 000000000000..b2f8a33b36ff --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,35 @@ +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d063fb36..eceaa082ec3f 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f20b05..9b0185fbe3eb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c941f88d405..9297a002d8e5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -334,7 +334,7 @@ SECTIONS __brk_limit = .; } - . = ALIGN(PAGE_SIZE); + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; STABS_DEBUG diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3ae7e16..3847e736702e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,13 +1408,10 @@ __init void lguest_init(void) { /* We're under lguest. */ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 000000000000..ec21796ac5fd --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,79 @@ +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. + */ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85af914e3d27..84df150ee77e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 89d97477c1d9..bce2e5d9edd4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,79 +58,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - - for (; addr < end; addr = next) { - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; - - if (pud_present(*pud)) { - pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); - continue; - } - pmd = (pmd_t *)info->alloc_pgt_page(info->context); - if (!pmd) - return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - - return 0; -} - -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; - - for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; - pud_t *pud; - - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - continue; - } - - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) - return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - - return 0; -} +#include "ident_map.c" /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh deleted file mode 100644 index 1a4c17bb3910..000000000000 --- a/arch/x86/tools/calc_run_size.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | sh calc_run_size.sh - -NUM='\([0-9a-fA-F]*[ \t]*\)' -OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') -if [ -z "$OUT" ] ; then - echo "Never found .bss or .brk file offset" >&2 - exit 1 -fi - -OUT=$(echo ${OUT# }) -sizeA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -sizeB=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetB=$(printf "%d" 0x${OUT%% *}) - -run_size=$(( $offsetA + $sizeA + $sizeB )) - -# BFD linker shows the same file offset in ELF. -if [ "$offsetA" -ne "$offsetB" ] ; then - # Gold linker shows them as consecutive. - endB=$(( $offsetB + $sizeB )) - if [ "$endB" != "$run_size" ] ; then - printf "sizeA: 0x%x\n" $sizeA >&2 - printf "offsetA: 0x%x\n" $offsetA >&2 - printf "sizeB: 0x%x\n" $sizeB >&2 - printf "offsetB: 0x%x\n" $offsetB >&2 - echo ".bss and .brk are non-contiguous" >&2 - exit 1 - fi -fi - -printf "%d\n" $run_size -exit 0 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6ab672233ac9..760789ae8562 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1206,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1528,6 +1526,11 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1548,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1684,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); @@ -1701,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b8516c..81603d99082b 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 5df733b8f704..2588ca6a9028 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex; #ifdef CONFIG_PNPBIOS extern struct pnp_protocol pnpbios_protocol; +extern bool arch_pnpbios_disabled(void); #define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol)) #else #define pnp_device_is_pnpbios(dev) 0 +#define arch_pnpbios_disabled() false #endif #ifdef CONFIG_PNPACPI diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 80159e6811c2..d9836c5eb694 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3351,12 +3351,18 @@ int main(int argc, char *argv[]) /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; + /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; + /* We don't support tboot: */ + boot->tboot_addr = 0; + + /* Ensure this is 0 to prevent APM from loading: */ + boot->apm_bios_info.version = 0; + /* We tell the kernel to initialize the Guest. */ tell_kernel(start);