Power management fixes for v4.7-rc7
- Fix a recent performance regression on Power systems (powernv and pseries) introduced by a core cpuidle commit that decreased the precision of the last_residency conversion from nano- to microseconds, which should not matter in theory, but turned out to play not-so-well with the special "snooze" idle state on Power (Shreyas B Prabhu). - Fix a crash during resume from hibernation on x86-64 caused by possible corruption of the kernel text part of page tables in the last phase of image restoration exposed by a security-related change during the 4.3 development cycle (Rafael Wysocki). -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.22 (GNU/Linux) iQIcBAABCAAGBQJXfuVmAAoJEILEb/54YlRx/TUQALIrIgVMtNpPhC0Mu0DQp+1y S2t/vWfLnMqfl1lzH/scDi4OC3cqEbEsYB5CUJPRysc1KB0fjJbW+BEuQuJyEhgf ac95mv1IU7DKiJ1Rp4qWvvBoovY3fjb1rhHMzvo9uh6HmKrSlWLL4PmnbGw/BaFL PgqEazsy/a9FgGT7VnvamVL6bGXgVMEf8VqoBeZFd0krH4qvBZHrS97OLqU685vT VW0IDZr0sI3u0pG9ropJ5b8ETtnyZ4lmTO5D4TzD9P8kZyjl2TBnSZ/ls/1YLh4N WdBZXbsfmFFt4jyDnOjilWOBA8R3IDteCDmG1l8XhK4Smbl7TRh2XVuRAHHEjG6b oMPHTeqIil3ExBS2gdzVEFNzYfHsyg3N9cXSochqnp3vboyZOiKaxIPCiVeBOZ+t bDPwtbovIly1aEnsJKUbHvBxJ4A9uPUqw99YbBtMK1VpQXZFg+s/0mBVNynkuu/k EUOuU6KcY6Br4JJPsqfb26NmARgLrrI+gTJbC3pFDHARtALnCbkkbmNZRRVRS3Zs 7CzGaYqgZ0ejMv/46jFPsHYKXU52AJ5HDhfXfELd8X3AvdhtmIszfqTkBgKx4gto fyTVzi2c6+RJJr5Jc07msYawaz86nEUjeXFsM59g6jC5dguSWnRxUuWV2zUM/ENb GDkzNdzif0t2DheJ4bKc =XL6s -----END PGP SIGNATURE----- Merge tag 'pm-4.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm Pull power management fixes from Rafael Wysocki: "One fix for a recent cpuidle core change that, against all odds, introduced a functional regression on Power systems and the fix for the crash during resume from hibernation on x86-64 that has been in the works for the last few weeks (it actually was ready last week, but I wanted to allow the reporters to test if for some more time). Specifics: - Fix a recent performance regression on Power systems (powernv and pseries) introduced by a core cpuidle commit that decreased the precision of the last_residency conversion from nano- to microseconds, which should not matter in theory, but turned out to play not-so-well with the special "snooze" idle state on Power (Shreyas B Prabhu). - Fix a crash during resume from hibernation on x86-64 caused by possible corruption of the kernel text part of page tables in the last phase of image restoration exposed by a security-related change during the 4.3 development cycle (Rafael Wysocki)" * tag 'pm-4.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: cpuidle: Fix last_residency division x86/power/64: Fix kernel text mapping corruption during image restoration
This commit is contained in:
commit
c09230f308
|
@ -19,6 +19,7 @@
|
|||
#include <asm/mtrr.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/suspend.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/* Defined in hibernate_asm_64.S */
|
||||
extern asmlinkage __visible int restore_image(void);
|
||||
|
@ -28,6 +29,7 @@ extern asmlinkage __visible int restore_image(void);
|
|||
* kernel's text (this value is passed in the image header).
|
||||
*/
|
||||
unsigned long restore_jump_address __visible;
|
||||
unsigned long jump_address_phys;
|
||||
|
||||
/*
|
||||
* Value of the cr3 register from before the hibernation (this value is passed
|
||||
|
@ -37,7 +39,43 @@ unsigned long restore_cr3 __visible;
|
|||
|
||||
pgd_t *temp_level4_pgt __visible;
|
||||
|
||||
void *relocated_restore_code __visible;
|
||||
unsigned long relocated_restore_code __visible;
|
||||
|
||||
static int set_up_temporary_text_mapping(void)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
pud_t *pud;
|
||||
|
||||
/*
|
||||
* The new mapping only has to cover the page containing the image
|
||||
* kernel's entry point (jump_address_phys), because the switch over to
|
||||
* it is carried out by relocated code running from a page allocated
|
||||
* specifically for this purpose and covered by the identity mapping, so
|
||||
* the temporary kernel text mapping is only needed for the final jump.
|
||||
* Moreover, in that mapping the virtual address of the image kernel's
|
||||
* entry point must be the same as its virtual address in the image
|
||||
* kernel (restore_jump_address), so the image kernel's
|
||||
* restore_registers() code doesn't find itself in a different area of
|
||||
* the virtual address space after switching over to the original page
|
||||
* tables used by the image kernel.
|
||||
*/
|
||||
pud = (pud_t *)get_safe_page(GFP_ATOMIC);
|
||||
if (!pud)
|
||||
return -ENOMEM;
|
||||
|
||||
pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
|
||||
if (!pmd)
|
||||
return -ENOMEM;
|
||||
|
||||
set_pmd(pmd + pmd_index(restore_jump_address),
|
||||
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
|
||||
set_pud(pud + pud_index(restore_jump_address),
|
||||
__pud(__pa(pmd) | _KERNPG_TABLE));
|
||||
set_pgd(temp_level4_pgt + pgd_index(restore_jump_address),
|
||||
__pgd(__pa(pud) | _KERNPG_TABLE));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *alloc_pgt_page(void *context)
|
||||
{
|
||||
|
@ -59,9 +97,10 @@ static int set_up_temporary_mappings(void)
|
|||
if (!temp_level4_pgt)
|
||||
return -ENOMEM;
|
||||
|
||||
/* It is safe to reuse the original kernel mapping */
|
||||
set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
|
||||
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
|
||||
/* Prepare a temporary mapping for the kernel text */
|
||||
result = set_up_temporary_text_mapping();
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
/* Set up the direct mapping from scratch */
|
||||
for (i = 0; i < nr_pfn_mapped; i++) {
|
||||
|
@ -78,19 +117,50 @@ static int set_up_temporary_mappings(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int relocate_restore_code(void)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
|
||||
relocated_restore_code = get_safe_page(GFP_ATOMIC);
|
||||
if (!relocated_restore_code)
|
||||
return -ENOMEM;
|
||||
|
||||
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
|
||||
|
||||
/* Make the page containing the relocated code executable */
|
||||
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
|
||||
pud = pud_offset(pgd, relocated_restore_code);
|
||||
if (pud_large(*pud)) {
|
||||
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
|
||||
} else {
|
||||
pmd_t *pmd = pmd_offset(pud, relocated_restore_code);
|
||||
|
||||
if (pmd_large(*pmd)) {
|
||||
set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
|
||||
} else {
|
||||
pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code);
|
||||
|
||||
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
|
||||
}
|
||||
}
|
||||
__flush_tlb_all();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int swsusp_arch_resume(void)
|
||||
{
|
||||
int error;
|
||||
|
||||
/* We have got enough memory and from now on we cannot recover */
|
||||
if ((error = set_up_temporary_mappings()))
|
||||
error = set_up_temporary_mappings();
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC);
|
||||
if (!relocated_restore_code)
|
||||
return -ENOMEM;
|
||||
memcpy(relocated_restore_code, &core_restore_code,
|
||||
&restore_registers - &core_restore_code);
|
||||
error = relocate_restore_code();
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
restore_image();
|
||||
return 0;
|
||||
|
@ -109,11 +179,12 @@ int pfn_is_nosave(unsigned long pfn)
|
|||
|
||||
struct restore_data_record {
|
||||
unsigned long jump_address;
|
||||
unsigned long jump_address_phys;
|
||||
unsigned long cr3;
|
||||
unsigned long magic;
|
||||
};
|
||||
|
||||
#define RESTORE_MAGIC 0x0123456789ABCDEFUL
|
||||
#define RESTORE_MAGIC 0x123456789ABCDEF0UL
|
||||
|
||||
/**
|
||||
* arch_hibernation_header_save - populate the architecture specific part
|
||||
|
@ -126,7 +197,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
|
|||
|
||||
if (max_size < sizeof(struct restore_data_record))
|
||||
return -EOVERFLOW;
|
||||
rdr->jump_address = restore_jump_address;
|
||||
rdr->jump_address = (unsigned long)&restore_registers;
|
||||
rdr->jump_address_phys = __pa_symbol(&restore_registers);
|
||||
rdr->cr3 = restore_cr3;
|
||||
rdr->magic = RESTORE_MAGIC;
|
||||
return 0;
|
||||
|
@ -142,6 +214,7 @@ int arch_hibernation_header_restore(void *addr)
|
|||
struct restore_data_record *rdr = addr;
|
||||
|
||||
restore_jump_address = rdr->jump_address;
|
||||
jump_address_phys = rdr->jump_address_phys;
|
||||
restore_cr3 = rdr->cr3;
|
||||
return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
|
||||
}
|
||||
|
|
|
@ -44,9 +44,6 @@ ENTRY(swsusp_arch_suspend)
|
|||
pushfq
|
||||
popq pt_regs_flags(%rax)
|
||||
|
||||
/* save the address of restore_registers */
|
||||
movq $restore_registers, %rax
|
||||
movq %rax, restore_jump_address(%rip)
|
||||
/* save cr3 */
|
||||
movq %cr3, %rax
|
||||
movq %rax, restore_cr3(%rip)
|
||||
|
@ -57,31 +54,34 @@ ENTRY(swsusp_arch_suspend)
|
|||
ENDPROC(swsusp_arch_suspend)
|
||||
|
||||
ENTRY(restore_image)
|
||||
/* switch to temporary page tables */
|
||||
movq $__PAGE_OFFSET, %rdx
|
||||
movq temp_level4_pgt(%rip), %rax
|
||||
subq %rdx, %rax
|
||||
movq %rax, %cr3
|
||||
/* Flush TLB */
|
||||
movq mmu_cr4_features(%rip), %rax
|
||||
movq %rax, %rdx
|
||||
andq $~(X86_CR4_PGE), %rdx
|
||||
movq %rdx, %cr4; # turn off PGE
|
||||
movq %cr3, %rcx; # flush TLB
|
||||
movq %rcx, %cr3;
|
||||
movq %rax, %cr4; # turn PGE back on
|
||||
|
||||
/* prepare to jump to the image kernel */
|
||||
movq restore_jump_address(%rip), %rax
|
||||
movq restore_cr3(%rip), %rbx
|
||||
movq restore_jump_address(%rip), %r8
|
||||
movq restore_cr3(%rip), %r9
|
||||
|
||||
/* prepare to switch to temporary page tables */
|
||||
movq temp_level4_pgt(%rip), %rax
|
||||
movq mmu_cr4_features(%rip), %rbx
|
||||
|
||||
/* prepare to copy image data to their original locations */
|
||||
movq restore_pblist(%rip), %rdx
|
||||
|
||||
/* jump to relocated restore code */
|
||||
movq relocated_restore_code(%rip), %rcx
|
||||
jmpq *%rcx
|
||||
|
||||
/* code below has been relocated to a safe page */
|
||||
ENTRY(core_restore_code)
|
||||
/* switch to temporary page tables */
|
||||
movq $__PAGE_OFFSET, %rcx
|
||||
subq %rcx, %rax
|
||||
movq %rax, %cr3
|
||||
/* flush TLB */
|
||||
movq %rbx, %rcx
|
||||
andq $~(X86_CR4_PGE), %rcx
|
||||
movq %rcx, %cr4; # turn off PGE
|
||||
movq %cr3, %rcx; # flush TLB
|
||||
movq %rcx, %cr3;
|
||||
movq %rbx, %cr4; # turn PGE back on
|
||||
.Lloop:
|
||||
testq %rdx, %rdx
|
||||
jz .Ldone
|
||||
|
@ -96,24 +96,17 @@ ENTRY(core_restore_code)
|
|||
/* progress to the next pbe */
|
||||
movq pbe_next(%rdx), %rdx
|
||||
jmp .Lloop
|
||||
|
||||
.Ldone:
|
||||
/* jump to the restore_registers address from the image header */
|
||||
jmpq *%rax
|
||||
/*
|
||||
* NOTE: This assumes that the boot kernel's text mapping covers the
|
||||
* image kernel's page containing restore_registers and the address of
|
||||
* this page is the same as in the image kernel's text mapping (it
|
||||
* should always be true, because the text mapping is linear, starting
|
||||
* from 0, and is supposed to cover the entire kernel text for every
|
||||
* kernel).
|
||||
*
|
||||
* code below belongs to the image kernel
|
||||
*/
|
||||
jmpq *%r8
|
||||
|
||||
/* code below belongs to the image kernel */
|
||||
.align PAGE_SIZE
|
||||
ENTRY(restore_registers)
|
||||
FRAME_BEGIN
|
||||
/* go back to the original page tables */
|
||||
movq %rbx, %cr3
|
||||
movq %r9, %cr3
|
||||
|
||||
/* Flush TLB, including "global" things (vmalloc) */
|
||||
movq mmu_cr4_features(%rip), %rax
|
||||
|
|
|
@ -173,7 +173,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|||
|
||||
struct cpuidle_state *target_state = &drv->states[index];
|
||||
bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
|
||||
u64 time_start, time_end;
|
||||
ktime_t time_start, time_end;
|
||||
s64 diff;
|
||||
|
||||
/*
|
||||
|
@ -195,13 +195,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|||
sched_idle_set_state(target_state);
|
||||
|
||||
trace_cpu_idle_rcuidle(index, dev->cpu);
|
||||
time_start = local_clock();
|
||||
time_start = ns_to_ktime(local_clock());
|
||||
|
||||
stop_critical_timings();
|
||||
entered_state = target_state->enter(dev, drv, index);
|
||||
start_critical_timings();
|
||||
|
||||
time_end = local_clock();
|
||||
time_end = ns_to_ktime(local_clock());
|
||||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
|
||||
|
||||
/* The cpu is no longer idle or about to enter idle. */
|
||||
|
@ -217,11 +217,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|||
if (!cpuidle_state_is_coupled(drv, index))
|
||||
local_irq_enable();
|
||||
|
||||
/*
|
||||
* local_clock() returns the time in nanosecond, let's shift
|
||||
* by 10 (divide by 1024) to have microsecond based time.
|
||||
*/
|
||||
diff = (time_end - time_start) >> 10;
|
||||
diff = ktime_us_delta(time_end, time_start);
|
||||
if (diff > INT_MAX)
|
||||
diff = INT_MAX;
|
||||
|
||||
|
|
Loading…
Reference in New Issue