From e78a77c38cf0ce3b8169ff6a6fd3711e81dc22c8 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 01/32] x86: GEODE: MFGPT: Minor cleanups - uninline timer functions; the compiler knows better than we do whether or not to inline these. - mfgpt_start_timer() had an unused 'clock' argument, drop it. From both Jordan and myself. Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 219f86eb6123..9146b2de1698 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -244,7 +244,7 @@ static int __init mfgpt_setup(char *str) } __setup("mfgpt_irq=", mfgpt_setup); -static inline void mfgpt_disable_timer(u16 clock) +static void mfgpt_disable_timer(u16 clock) { u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP); geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN); @@ -263,7 +263,7 @@ static struct clock_event_device mfgpt_clockevent = { .shift = 32 }; -static inline void mfgpt_start_timer(u16 clock, u16 delta) +static void mfgpt_start_timer(u16 delta) { geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); @@ -278,14 +278,14 @@ static void mfgpt_set_mode(enum clock_event_mode mode, mfgpt_disable_timer(mfgpt_event_clock); if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(mfgpt_event_clock, MFGPT_PERIODIC); + mfgpt_start_timer(MFGPT_PERIODIC); mfgpt_tick_mode = mode; } static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) { - mfgpt_start_timer(mfgpt_event_clock, delta); + mfgpt_start_timer(delta); return 0; } From 36445cf30686b9ea4ddf71f28057e4dd07db0e2d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 02/32] x86: 
GEODE fix MFGPT input clock value The GEODE MFGPT code assumed that 32kHz was 32000 Hz while the boards run on a 32.768 kHz digital watch crystal. In practise, it will not change the timer's frequency as the skew was only 2.4%, but it should provide more accurate intervals. Signed-off-by: Willy Tarreau Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 9146b2de1698..586228140b9e 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -12,21 +12,20 @@ */ /* - * We are using the 32Khz input clock - its the only one that has the + * We are using the 32.768kHz input clock - it's the only one that has the * ranges we find desirable. The following table lists the suitable - * divisors and the associated hz, minimum interval - * and the maximum interval: + * divisors and the associated Hz, minimum interval and the maximum interval: * - * Divisor Hz Min Delta (S) Max Delta (S) - * 1 32000 .0005 2.048 - * 2 16000 .001 4.096 - * 4 8000 .002 8.192 - * 8 4000 .004 16.384 - * 16 2000 .008 32.768 - * 32 1000 .016 65.536 - * 64 500 .032 131.072 - * 128 250 .064 262.144 - * 256 125 .128 524.288 + * Divisor Hz Min Delta (s) Max Delta (s) + * 1 32768 .00048828125 2.000 + * 2 16384 .0009765625 4.000 + * 4 8192 .001953125 8.000 + * 8 4096 .00390625 16.000 + * 16 2048 .0078125 32.000 + * 32 1024 .015625 64.000 + * 64 512 .03125 128.000 + * 128 256 .0625 256.000 + * 256 128 .125 512.000 */ #include @@ -45,7 +44,7 @@ static struct mfgpt_timer_t { #define MFGPT_DIVISOR 16 #define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32000 / MFGPT_DIVISOR) +#define MFGPT_HZ (32768 / MFGPT_DIVISOR) #define MFGPT_PERIODIC (MFGPT_HZ / HZ) #ifdef CONFIG_GEODE_MFGPT_TIMER From fa28e067c3b8af96c79c060e163b1387c172ae75 
Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 03/32] x86: GEODE: MFGPT: drop module owner usage from MFGPT API We had planned to use the 'owner' field for allowing re-allocation of MFGPTs; however, doing it by module owner name isn't flexible enough. So, drop this for now. If it turns out that we need timers in modules, we'll need to come up with a scheme that matches the write-once fields of the MFGPTx_SETUP register, and drops ponies from the sky. Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 14 +++++--------- include/asm-x86/geode.h | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 586228140b9e..186bd3649108 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -30,14 +30,12 @@ #include #include -#include #include #define F_AVAIL 0x01 static struct mfgpt_timer_t { int flags; - struct module *owner; } mfgpt_timers[MFGPT_MAX_TIMERS]; /* Selected from the table above */ @@ -182,15 +180,14 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) return 0; } -static int mfgpt_get(int timer, struct module *owner) +static int mfgpt_get(int timer) { mfgpt_timers[timer].flags &= ~F_AVAIL; - mfgpt_timers[timer].owner = owner; printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); return timer; } -int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) +int geode_mfgpt_alloc_timer(int timer, int domain) { int i; @@ -203,7 +200,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) /* Try to find an available timer */ for (i = 0; i < MFGPT_MAX_TIMERS; i++) { if (mfgpt_timers[i].flags & F_AVAIL) - return mfgpt_get(i, owner); + return mfgpt_get(i); if (i == 5 && domain == MFGPT_DOMAIN_WORKING) break; @@ -211,7 +208,7 @@ int geode_mfgpt_alloc_timer(int timer, int 
domain, struct module *owner) } else { /* If they requested a specific timer, try to honor that */ if (mfgpt_timers[timer].flags & F_AVAIL) - return mfgpt_get(timer, owner); + return mfgpt_get(timer); } /* No timers available - too bad */ @@ -324,8 +321,7 @@ static int __init mfgpt_timer_setup(void) int timer, ret; u16 val; - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING, - THIS_MODULE); + timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); if (timer < 0) { printk(KERN_ERR "mfgpt-timer: Could not allocate a MFPGT timer\n"); diff --git a/include/asm-x86/geode.h b/include/asm-x86/geode.h index 811fe14f70b2..c4482753a358 100644 --- a/include/asm-x86/geode.h +++ b/include/asm-x86/geode.h @@ -209,7 +209,7 @@ static inline u16 geode_mfgpt_read(int timer, u16 reg) extern int __init geode_mfgpt_detect(void); extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable); extern int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable); -extern int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner); +extern int geode_mfgpt_alloc_timer(int timer, int domain); #define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1) #define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0) From 9501b2efd70ad3957a70d44de54dab7c52f9b882 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 04/32] x86: GEODE: MFGPT: replace 'flags' field with 'avail' bit Drop F_AVAIL and the 'flags' field, replacing with an 'avail' bit. This looks more understandable to me. 
Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 186bd3649108..6f79061cf119 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -32,10 +32,8 @@ #include #include -#define F_AVAIL 0x01 - static struct mfgpt_timer_t { - int flags; + unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; /* Selected from the table above */ @@ -95,7 +93,7 @@ int __init geode_mfgpt_detect(void) for (i = 0; i < MFGPT_MAX_TIMERS; i++) { val = geode_mfgpt_read(i, MFGPT_REG_SETUP); if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].flags = F_AVAIL; + mfgpt_timers[i].avail = 1; count++; } } @@ -182,7 +180,7 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) static int mfgpt_get(int timer) { - mfgpt_timers[timer].flags &= ~F_AVAIL; + mfgpt_timers[timer].avail = 0; printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); return timer; } @@ -199,7 +197,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) if (timer < 0) { /* Try to find an available timer */ for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].flags & F_AVAIL) + if (mfgpt_timers[i].avail) return mfgpt_get(i); if (i == 5 && domain == MFGPT_DOMAIN_WORKING) @@ -207,7 +205,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) } } else { /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].flags & F_AVAIL) + if (mfgpt_timers[timer].avail) return mfgpt_get(timer); } From b0e6bf2571e9385335e6337bdedb85cb629ab3fb Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 05/32] x86: GEODE: MFGPT: make mfgpt_timer_setup available outside of mfgpt_32.c We need to be called from elsewhere, and this gets some #ifdefs out of the .c file. 
Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 +------- include/asm-x86/geode.h | 6 ++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 6f79061cf119..5cf3a839530c 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -43,12 +43,6 @@ static struct mfgpt_timer_t { #define MFGPT_HZ (32768 / MFGPT_DIVISOR) #define MFGPT_PERIODIC (MFGPT_HZ / HZ) -#ifdef CONFIG_GEODE_MFGPT_TIMER -static int __init mfgpt_timer_setup(void); -#else -#define mfgpt_timer_setup() (0) -#endif - /* Allow for disabling of MFGPTs */ static int disable; static int __init mfgpt_disable(char *s) @@ -314,7 +308,7 @@ static struct irqaction mfgptirq = { .name = "mfgpt-timer" }; -static int __init mfgpt_timer_setup(void) +int __init mfgpt_timer_setup(void) { int timer, ret; u16 val; diff --git a/include/asm-x86/geode.h b/include/asm-x86/geode.h index c4482753a358..c13630655d62 100644 --- a/include/asm-x86/geode.h +++ b/include/asm-x86/geode.h @@ -214,4 +214,10 @@ extern int geode_mfgpt_alloc_timer(int timer, int domain); #define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1) #define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0) +#ifdef CONFIG_GEODE_MFGPT_TIMER +extern int __init mfgpt_timer_setup(void); +#else +static inline int mfgpt_timer_setup(void) { return 0; } +#endif + #endif From f087515c658a68454d43909d482ea4b59e7d6d5c Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 06/32] x86: GEODE: MFGPT: Use "just-in-time" detection for the MFGPT timers There isn't much value to always detecting the MFGPT timers on Geode platforms; detection is only needed when something wants to use the timers. Move the detection code so that it gets called the first time a timer is allocated. 
Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/geode_32.c | 5 +---- arch/x86/kernel/mfgpt_32.c | 39 ++++++++++++++++++++++++++------------ include/asm-x86/geode.h | 1 - 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index 9c7f7d395968..9dad6ca6cd70 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -163,14 +163,11 @@ EXPORT_SYMBOL_GPL(geode_gpio_setup_event); static int __init geode_southbridge_init(void) { - int timers; - if (!is_geode()) return -ENODEV; init_lbars(); - timers = geode_mfgpt_detect(); - printk(KERN_INFO "geode: %d MFGPT timers available.\n", timers); + (void) mfgpt_timer_setup(); return 0; } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 5cf3a839530c..abdb7c71199a 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -74,28 +74,37 @@ __setup("mfgptfix", mfgpt_fix); * In other cases (such as with VSAless OpenFirmware), the system firmware * leaves timers available for us to use. 
*/ -int __init geode_mfgpt_detect(void) + + +static int timers = -1; + +static void geode_mfgpt_detect(void) { - int count = 0, i; + int i; u16 val; + timers = 0; + if (disable) { - printk(KERN_INFO "geode-mfgpt: Skipping MFGPT setup\n"); - return 0; + printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); + goto done; + } + + if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { + printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); + goto done; } for (i = 0; i < MFGPT_MAX_TIMERS; i++) { val = geode_mfgpt_read(i, MFGPT_REG_SETUP); if (!(val & MFGPT_SETUP_SETUP)) { mfgpt_timers[i].avail = 1; - count++; + timers++; } } - /* set up clock event device, if desired */ - i = mfgpt_timer_setup(); - - return count; +done: + printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); } int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) @@ -183,10 +192,16 @@ int geode_mfgpt_alloc_timer(int timer, int domain) { int i; - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) - return -ENODEV; + if (timers == -1) { + /* timers haven't been detected yet */ + geode_mfgpt_detect(); + } + + if (!timers) + return -1; + if (timer >= MFGPT_MAX_TIMERS) - return -EIO; + return -1; if (timer < 0) { /* Try to find an available timer */ diff --git a/include/asm-x86/geode.h b/include/asm-x86/geode.h index c13630655d62..9e7280092a48 100644 --- a/include/asm-x86/geode.h +++ b/include/asm-x86/geode.h @@ -206,7 +206,6 @@ static inline u16 geode_mfgpt_read(int timer, u16 reg) return inw(base + reg + (timer * 8)); } -extern int __init geode_mfgpt_detect(void); extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable); extern int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable); extern int geode_mfgpt_alloc_timer(int timer, int domain); From f54ae69bafa16434ce46bc2f1fe556bce4d23650 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 07/32] x86: GEODE: MFGPT: fix a potential race when disabling a 
timer We *really* don't want to be reading MFGPTx_SETUP and writing back those values. What we want to be doing is clearing CMP1 and CMP2 unconditionally; otherwise, we have races where CMP1 and/or CMP2 fire after we've read MFGPTx_SETUP. They can also fire between when we've written ~CNTEN to the register, and when the new register values get copied to the timer's version of the register. By clearing both fields, we're okay. Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index abdb7c71199a..81aa9db01f5f 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -249,8 +249,9 @@ __setup("mfgpt_irq=", mfgpt_setup); static void mfgpt_disable_timer(u16 clock) { - u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP); - geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN); + /* avoid races by clearing CMP1 and CMP2 unconditionally */ + geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | + MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); } static int mfgpt_next_event(unsigned long, struct clock_event_device *); From dcee77be2f0a7010633fb2c025db38550c4b0e72 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 08/32] x86: GEODE: make sure the right MFGPT timer fired the timer tick Each AMD Geode MFGPT timer interrupt output is paired with another timer; essentially the interrupt goes if either timer fires. This is okay, but the handlers need to be aware of this. Make sure in the timer tick handler that our timer really did expire.
Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 81aa9db01f5f..eeb461f391a0 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -293,10 +293,14 @@ static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) return 0; } -/* Assume (foolishly?), that this interrupt was due to our tick */ - static irqreturn_t mfgpt_tick(int irq, void *dev_id) { + u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); + + /* See if the interrupt was for us */ + if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) + return IRQ_NONE; + /* Turn off the clock (and clear the event) */ mfgpt_disable_timer(mfgpt_event_clock); From 3406c158ba8e83defb178e867919e24e110a59bf Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 09/32] x86: GEODE: MFGPT: fix typo in printk in mfgpt_timer_setup Signed-off-by: Arnd Hannemann Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index eeb461f391a0..027fc067b399 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -370,7 +370,7 @@ int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGT timer as a clock event.\n"); + "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); clockevents_register_device(&mfgpt_clockevent); return 0; From 88a5ac89667d22e1471ba1f45ea635df1f7da06f Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 10/32] x86: fix sparse warning in xen/time.c Use xen_khz to denote 
xen_specific clock speed. Avoid shadowing cpu_khz. arch/x86/xen/time.c:220:6: warning: symbol 'cpu_khz' shadows an earlier one include/asm/tsc.h:17:21: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/time.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3721fd6877b..c39e1a5aa241 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -217,17 +217,17 @@ unsigned long long xen_sched_clock(void) /* Get the CPU speed from Xen */ unsigned long xen_cpu_khz(void) { - u64 cpu_khz = 1000000ULL << 32; + u64 xen_khz = 1000000ULL << 32; const struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(cpu_khz, info->tsc_to_system_mul); + do_div(xen_khz, info->tsc_to_system_mul); if (info->tsc_shift < 0) - cpu_khz <<= -info->tsc_shift; + xen_khz <<= -info->tsc_shift; else - cpu_khz >>= info->tsc_shift; + xen_khz >>= info->tsc_shift; - return cpu_khz; + return xen_khz; } /* From 7c36752a6be84892afb085c67fd4209e686db482 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 11/32] x86: sparse warning in therm_throt.c arch/x86/kernel/cpu/mcheck/therm_throt.c:121:2: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 24885be5c48c..9b7e01daa1ca 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -118,7 +118,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - return sysfs_remove_group(&sys_dev->kobj, 
&thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } /* Mutex protecting device creation against CPU hotplug */ From da7bfc50f5cb54aeee8147dca0c1de9d487cb5e0 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 12/32] x86: sparse warnings in pageattr.c Adjust the definition of lookup_address to take an unsigned long level argument. Adjust callers in xen/mmu.c that pass in a dummy variable. Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 8 +++++--- arch/x86/xen/mmu.c | 6 +++--- include/asm-x86/pgtable.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8493c855582b..eb2a54415a77 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -191,7 +191,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) * or when the present bit is not set. Otherwise we would return a * pointer to a nonexisting mapping. 
*/ -pte_t *lookup_address(unsigned long address, int *level) +pte_t *lookup_address(unsigned long address, unsigned int *level) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; @@ -255,7 +255,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, unsigned long nextpage_addr, numpages, pmask, psize, flags; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int level, do_split = 1; + int do_split = 1; + unsigned int level; spin_lock_irqsave(&pgd_lock, flags); /* @@ -406,7 +407,8 @@ out_unlock: static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { - int level, do_split, err; + int do_split, err; + unsigned int level; struct page *kpte_page; pte_t *kpte; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 45aa771e73a9..0144395448ae 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,7 +58,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { - int level; + unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & PAGE_MASK; @@ -71,7 +71,7 @@ void make_lowmem_page_readonly(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); @@ -86,7 +86,7 @@ void make_lowmem_page_readwrite(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 44c0a4f1b1eb..174b87738714 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -255,7 +255,7 @@ enum { * NOTE: the return type is pte_t but if the pmd is PSE then we return it * as a pte too. 
*/ -extern pte_t *lookup_address(unsigned long address, int *level); +extern pte_t *lookup_address(unsigned long address, unsigned int *level); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) From 9583d050d5b7bad76423b2bd667b174a122067a7 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 13/32] x86: fix sparse warning in topology.c arch/x86/kernel/topology.c:56:2: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index e6757aaa202b..a40051b71d9b 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL(arch_register_cpu); void arch_unregister_cpu(int num) { - return unregister_cpu(&per_cpu(cpu_devices, num).cpu); + unregister_cpu(&per_cpu(cpu_devices, num).cpu); } EXPORT_SYMBOL(arch_unregister_cpu); #else From 6697c05296fab4d113c7144459b72b6172b485a5 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 14/32] x86: fix sparse warnings in acpi/bus.c Add function definition and extern variables to asm-x86/acpi.h. All of these are used in bus.c in ifdef(CONFIG_X86) sections, so are only added to the x86 include headers. boot.c already includes acpi.h so no changes are needed there. Fixes the following: arch/x86/kernel/acpi/boot.c:83:4: warning: symbol 'acpi_sci_flags' was not declared. Should it be static? arch/x86/kernel/acpi/boot.c:84:5: warning: symbol 'acpi_sci_override_gsi' was not declared. Should it be static? arch/x86/kernel/acpi/boot.c:421:13: warning: symbol 'acpi_pic_sci_set_trigger' was not declared. Should it be static? 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/acpi/bus.c | 7 +------ include/asm-x86/acpi.h | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 8b0d4b7d188a..ce3c0a2cbac4 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_X86 #include #endif @@ -39,9 +40,6 @@ #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("bus"); -#ifdef CONFIG_X86 -extern void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger); -#endif struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; @@ -653,8 +651,6 @@ void __init acpi_early_init(void) #ifdef CONFIG_X86 if (!acpi_ioapic) { - extern u8 acpi_sci_flags; - /* compatible (0) means level (3) */ if (!(acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)) { acpi_sci_flags &= ~ACPI_MADT_TRIGGER_MASK; @@ -664,7 +660,6 @@ void __init acpi_early_init(void) acpi_pic_sci_set_trigger(acpi_gbl_FADT.sci_interrupt, (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2); } else { - extern int acpi_sci_override_gsi; /* * now that acpi_gbl_FADT is initialized, * update it with result from INT_SRC_OVR parsing diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h index 98a9ca266531..7a72d6aa50be 100644 --- a/include/asm-x86/acpi.h +++ b/include/asm-x86/acpi.h @@ -89,6 +89,10 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; +extern u8 acpi_sci_flags; +extern int acpi_sci_override_gsi; +void acpi_pic_sci_set_trigger(unsigned int, u16); + static inline void disable_acpi(void) { acpi_disabled = 1; From 1ec7fd50ba4f845d1cf6b67acabd774577ef13b6 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 15/32] brk: document randomize_va_space and CONFIG_COMPAT_BRK (was Re: Document randomize_va_space and CONFIG_COMPAT_BRK. 
Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/sysctl/kernel.txt | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 8984a5396271..dc8801d4e944 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -41,6 +41,7 @@ show up in /proc/sys/kernel: - pid_max - powersave-nap [ PPC only ] - printk +- randomize_va_space - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] - rtsig-max @@ -280,6 +281,34 @@ send before ratelimiting kicks in. ============================================================== +randomize-va-space: + +This option can be used to select the type of process address +space randomization that is used in the system, for architectures +that support this feature. + +0 - Turn the process address space randomization off by default. + +1 - Make the addresses of mmap base, stack and VDSO page randomized. + This, among other things, implies that shared libraries will be + loaded to random addresses. Also for PIE-linked binaries, the location + of code start is randomized. + + With heap randomization, the situation is a little bit more + complicated. + There a few legacy applications out there (such as some ancient + versions of libc.so.5 from 1996) that assume that brk area starts + just after the end of the code+bss. These applications break when + start of the brk area is randomized. There are however no known + non-legacy applications that would be broken this way, so for most + systems it is safe to choose full randomization. However there is + a CONFIG_COMPAT_BRK option for systems with ancient and/or broken + binaries, that makes heap non-randomized, but keeps all other + parts of process address space randomized if randomize_va_space + sysctl is turned on. + +============================================================== + reboot-cmd: (Sparc only) ??? 
This seems to be a way to give an argument to the Sparc From 3701d863b43d05ffeb223d269583398f914fb5d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: [PATCH 16/32] x86: fixup more paravirt fallout Use a common irq_return entry point for all the iret places, which need the paravirt INTERRUPT return wrapper. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 15 ++++++--------- arch/x86/kernel/entry_64.S | 18 +++++++++++++----- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index be5c31d04884..824e21b80aad 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,8 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -1: INTERRUPT_RETURN +ENTRY(irq_return) + INTERRUPT_RETURN .section .fixup,"ax" iret_exc: pushl $0 # no error code @@ -418,7 +419,7 @@ iret_exc: .previous .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long irq_return,iret_exc .previous CFI_RESTORE_STATE @@ -865,20 +866,16 @@ nmi_espfix_stack: RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 -1: INTERRUPT_RETURN + jmp irq_return CFI_ENDPROC -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous KPROBE_END(nmi) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) -1: iret + iret .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long native_iret, iret_exc .previous END(native_iret) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7341e81941c..6be39a387c5a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -581,16 +581,24 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 0,8,0 -#ifdef CONFIG_PARAVIRT + RESTORE_ARGS 0,8,0 + +ENTRY(irq_return) INTERRUPT_RETURN -#endif + + .section __ex_table, "a" + .quad irq_return, bad_iret 
+ .previous + +#ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq .section __ex_table,"a" .quad native_iret, bad_iret .previous +#endif + .section .fixup,"ax" bad_iret: /* @@ -804,7 +812,7 @@ paranoid_swapgs\trace: SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - INTERRUPT_RETURN + jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -919,7 +927,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq native_iret(%rip),%rbp + leaq irq_return(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ From bfc734b24671b2639218ae2ef53af91dfd30b6c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 17/32] x86: avoid unused variable warning in mm/init_64.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fe880fc305d..620d2b6b6bf4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -532,9 +532,9 @@ void __init mem_init(void) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; + unsigned long addr = begin; - if (begin >= end) + if (addr >= end) return; /* @@ -549,7 +549,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) #else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), From 185c045c245f46485ad8bbd8cc1100e986ff3f13 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 
18/32] x86, core: remove CONFIG_FORCED_INLINING Other than the defconfigs, remove the entry in compiler-gcc4.h, Kconfig.debug and feature-removal-schedule.txt. Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 9 --------- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - include/linux/compiler-gcc4.h | 9 --------- lib/Kconfig.debug | 14 -------------- 5 files changed, 34 deletions(-) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index ce9503c892b5..557f4a2f1655 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -111,15 +111,6 @@ Who: Christoph Hellwig --------------------------- -What: CONFIG_FORCED_INLINING -When: June 2006 -Why: Config option is there to see if gcc is good enough. (in january - 2006). If it is, the behavior should just be the default. If it's not, - the option should just go away entirely. 
-Who: Arjan van de Ven - ---------------------------- - What: eepro100 network driver When: January 2007 Why: replaced by the e100 driver diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 77562e7cdab6..3df340b54e57 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1421,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9e2b0ef851de..eef98cb00c62 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1346,7 +1346,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 0ab3a3232330..974f5b7bb205 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -5,15 +5,6 @@ /* These definitions are for GCC v4.x. 
*/ #include -#ifdef CONFIG_FORCED_INLINING -# undef inline -# undef __inline__ -# undef __inline -# define inline inline __attribute__((always_inline)) -# define __inline__ __inline__ __attribute__((always_inline)) -# define __inline __inline __attribute__((always_inline)) -#endif - #define __used __attribute__((__used__)) #define __must_check __attribute__((warn_unused_result)) #define __compiler_offsetof(a,b) __builtin_offsetof(a,b) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ce0bb2600c25..a370fe828a79 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -465,20 +465,6 @@ config FRAME_POINTER some architectures or if you use external debuggers. If you don't debug the kernel, you can say N. -config FORCED_INLINING - bool "Force gcc to inline functions marked 'inline'" - depends on DEBUG_KERNEL - default y - help - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. The gcc 4.x series have a rewritten inlining algorithm and - disabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc4 to make the decision can - become the default in the future, until then this option is there to - test gcc for this. - config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY From 551889a6e2a24a9c06fd453ea03b57b7746ffdc0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 19/32] x86: construct 32-bit boot time page tables in native format. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specifically the boot time page tables in a CONFIG_X86_PAE=y enabled kernel are in PAE format. early_ioremap is updated to use the standard page table accessors. 
Clear any mappings beyond max_low_pfn from the boot page tables in native_pagetable_setup_start because the initial mappings can extend beyond the range of physical memory and into the vmalloc area. Derived from patches by Eric Biederman and H. Peter Anvin. [ jeremy@goop.org: PAE swapper_pg_dir needs to be page-sized fix ] Signed-off-by: Ian Campbell Cc: H. Peter Anvin Cc: Eric W. Biederman Cc: Andi Kleen Cc: Mika Penttilä Cc: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_32.S | 151 +++++++++++++++++++++++++++-------- arch/x86/kernel/setup_32.c | 4 + arch/x86/mm/init_32.c | 70 ++++++---------- arch/x86/mm/ioremap.c | 55 +++++++------ include/asm-x86/page_32.h | 1 - include/asm-x86/pgtable_32.h | 4 - 6 files changed, 177 insertions(+), 108 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 5d8c5730686b..74ef4a41f224 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,6 +19,10 @@ #include #include #include +#include + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) /* * References to members of the new_cpu_data structure. @@ -80,10 +84,6 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_ */ .section .text.head,"ax",@progbits ENTRY(startup_32) - /* check to see if KEEP_SEGMENTS flag is meaningful */ - cmpw $0x207, BP_version(%esi) - jb 1f - /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ testb $(1<<6), BP_loadflags(%esi) @@ -92,7 +92,7 @@ ENTRY(startup_32) /* * Set segments to known values. 
*/ -1: lgdt boot_gdt_descr - __PAGE_OFFSET + lgdt pa(boot_gdt_descr) movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es @@ -105,8 +105,8 @@ ENTRY(startup_32) */ cld xorl %eax,%eax - movl $__bss_start - __PAGE_OFFSET,%edi - movl $__bss_stop - __PAGE_OFFSET,%ecx + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx rep ; stosl @@ -118,31 +118,32 @@ ENTRY(startup_32) * (kexec on panic case). Hence copy out the parameters before initializing * page tables. */ - movl $(boot_params - __PAGE_OFFSET),%edi + movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld rep movsl - movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi + movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No comand line - movl $(boot_command_line - __PAGE_OFFSET),%edi + movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl 1: #ifdef CONFIG_PARAVIRT - cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET) + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) jb default_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ - movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax + movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax jae bad_subarch - movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax + movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax @@ -170,17 +171,68 @@ num_subarch_entries = (. - subarch_entries) / 4 * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. * - * Warning: don't use %esi or the stack in this code. However, %esp - * can be used as a GPR if you really need it... + * Note that the stack is not yet set up! 
*/ -page_pde_offset = (__PAGE_OFFSET >> 20); +#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ +#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ default_entry: - movl $(pg0 - __PAGE_OFFSET), %edi - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode swapper_pg_dir is statically defined to contain enough + * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD + * entries to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at + * this stage. + */ + +#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(pg0), %edi + movl $pa(swapper_pg_pmd), %edx + movl $PTE_ATTR, %eax 10: - leal 0x007(%edi),%ecx /* Create PDE entry */ + leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables. 
+ */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + cmpl %ebp,%eax + jb 10b +1: + movl %edi,pa(init_pg_tables_end) + + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(pg0), %edi + movl $pa(swapper_pg_dir), %edx + movl $PTE_ATTR, %eax +10: + leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -189,19 +241,20 @@ default_entry: stosl addl $0x1000,%eax loop 11b - /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ - /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ - leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables; the +0x007 is + * the attribute bits + */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b - movl %edi,(init_pg_tables_end - __PAGE_OFFSET) - - /* Do an early initialization of the fixmap area */ - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax - addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ - movl %eax, 4092(%edx) + movl %edi,pa(init_pg_tables_end) + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_dir+0xffc) +#endif jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -241,7 +294,7 @@ ENTRY(startup_32_smp) * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. 
*/ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET +#define cr4_bits pa(mmu_cr4_features) movl cr4_bits,%edx andl %edx,%edx jz 6f @@ -276,10 +329,10 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $swapper_pg_dir-__PAGE_OFFSET,%eax + movl $pa(swapper_pg_dir),%eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax - orl $0x80000000,%eax + orl $X86_CR0_PG,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: @@ -552,16 +605,44 @@ ENTRY(_stext) */ .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm +#ifdef CONFIG_X86_PAE +ENTRY(swapper_pg_pmd) + .fill 1024*KPMDS,4,0 +#else ENTRY(swapper_pg_dir) .fill 1024,4,0 -ENTRY(swapper_pg_pmd) +#endif +ENTRY(swapper_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 - /* * This starts the data section. */ +#ifdef CONFIG_X86_PAE +.section ".data.page_aligned","wa" + /* Page-aligned for the benefit of paravirt? */ + .align PAGE_SIZE_asm +ENTRY(swapper_pg_dir) + .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE_asm /* needs to be page-sized too */ +#endif + .data ENTRY(stack_start) .long init_thread_union+THREAD_SIZE diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index d1d8c347cc0b..691ab4cb167b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -154,7 +154,11 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); +#ifndef 
CONFIG_X86_PAE unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d1bc04006d16..54aba3cf9efe 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -46,6 +46,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -328,44 +329,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { -#ifdef CONFIG_X86_PAE - int i; + unsigned long pfn, va; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* - * Init entries of the first-level page table to the - * zero page, if they haven't already been set up. - * - * In a normal native boot, we'll be running on a - * pagetable rooted in swapper_pg_dir, but not in PAE - * mode, so this will end up clobbering the mappings - * for the lower 24Mbytes of the address space, - * without affecting the kernel address space. + * Remove any mappings which extend past the end of physical + * memory from the boot time page table: */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) - set_pgd(&base[i], - __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<> PAGE_SHIFT); -#endif } void __init native_pagetable_setup_done(pgd_t *base) { -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&base[0], base[USER_PTRS_PER_PGD]); -#endif } /* @@ -374,9 +369,8 @@ void __init native_pagetable_setup_done(pgd_t *base) * the boot process. 
* * If we're booting on native hardware, this will be a pagetable - * constructed in arch/i386/kernel/head.S, and not running in PAE mode - * (even if we'll end up running in PAE). The root of the pagetable - * will be swapper_pg_dir. + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. * * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may @@ -537,14 +531,6 @@ void __init paging_init(void) load_cr3(swapper_pg_dir); -#ifdef CONFIG_X86_PAE - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif __flush_tlb_all(); kmap_init(); @@ -675,10 +661,6 @@ void __init mem_init(void) BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ee6648fe6b15..1106b7f477bd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -260,41 +260,46 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata unsigned long bm_pte[1024] +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __attribute__((aligned(PAGE_SIZE))); -static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { - return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); + pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; } -static inline unsigned long * __init early_ioremap_pte(unsigned long addr) 
+static inline pte_t * __init early_ioremap_pte(unsigned long addr) { - return bm_pte + ((addr >> PAGE_SHIFT) & 1023); + return &bm_pte[pte_index(addr)]; } void __init early_ioremap_init(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = __pa(bm_pte) | _PAGE_TABLE; + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); + set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE)); + /* - * The boot-ioremap range spans multiple pgds, for which + * The boot-ioremap range spans multiple pmds, for which * we are not prepared: */ - if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); - printk(KERN_WARNING "pgd %p != %p\n", - pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", - fix_to_virt(FIX_BTMAP_BEGIN)); + fix_to_virt(FIX_BTMAP_BEGIN)); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", - fix_to_virt(FIX_BTMAP_END)); + fix_to_virt(FIX_BTMAP_END)); printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", @@ -304,28 +309,29 @@ void __init early_ioremap_init(void) void __init early_ioremap_clear(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_clear()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = 0; - paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + pmd_clear(pmd); + paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT); __flush_tlb_all(); } void __init early_ioremap_reset(void) { enum fixed_addresses idx; - unsigned long *pte, phys, addr; + unsigned long addr, phys; + pte_t *pte; after_paging_init = 1; for 
(idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { addr = fix_to_virt(idx); pte = early_ioremap_pte(addr); - if (*pte & _PAGE_PRESENT) { - phys = *pte & PAGE_MASK; + if (pte_present(*pte)) { + phys = pte_val(*pte) & PAGE_MASK; set_fixmap(idx, phys); } } @@ -334,7 +340,8 @@ void __init early_ioremap_reset(void) static void __init __early_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { - unsigned long *pte, addr = __fix_to_virt(idx); + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; if (idx >= __end_of_fixed_addresses) { BUG(); @@ -342,9 +349,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } pte = early_ioremap_pte(addr); if (pgprot_val(flags)) - *pte = (phys & PAGE_MASK) | pgprot_val(flags); + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else - *pte = 0; + pte_clear(NULL, addr, pte); __flush_tlb_one(addr); } diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 984998a30741..5f7257fd589b 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -48,7 +48,6 @@ typedef unsigned long pgprotval_t; typedef unsigned long phys_addr_t; typedef union { pteval_t pte, pte_low; } pte_t; -typedef pte_t boot_pte_t; #endif /* __ASSEMBLY__ */ #endif /* CONFIG_X86_PAE */ diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 80dd438642f6..a842c7222b1e 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -52,10 +52,6 @@ void paging_init(void); #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) -#define TWOLEVEL_PGDIR_SHIFT 22 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. 
That means that From b6fbb669c8ef3a112121697ca901c290ccd35eb2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 20/32] x86: fix early_ioremap pagetable ops Some important parts of f6df72e71eba621b2f5c49b3a763116fac748f6e got dropped along the way, reintroduce them. Only affects paravirt guests. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1106b7f477bd..a4897a85268a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -286,7 +286,7 @@ void __init early_ioremap_init(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); - set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which @@ -316,7 +316,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT); + paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } From 9b706aee7d92d6ac3002547aea12e3eaa0a750ae Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 21/32] x86: trivial printk optimizations In arch/x86/boot/printf.c gets rid of unused tail of digits: const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; (we are using 0-9a-f only) Uses smaller/faster lowercasing (by ORing with 0x20) if we know that we work on numbers/digits. 
Makes strtoul smaller, and also we are getting rid of static const char small_digits[] = "0123456789abcdefx"; static const char large_digits[] = "0123456789ABCDEFX"; since this works equally well: static const char digits[16] = "0123456789ABCDEF"; Size savings: $ size vmlinux.org vmlinux text data bss dec hex filename 877320 112252 90112 1079684 107984 vmlinux.org 877048 112252 90112 1079412 107874 vmlinux It may be also a tiny bit faster because code has less branches now, but I doubt it is measurable. [ hugh@veritas.com: uppercase pointers fix ] Signed-off-by: Denys Vlasenko Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/printf.c | 24 ++++++++++++--------- lib/vsprintf.c | 49 +++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 1a09f9309d3c..7e7e890699be 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -33,8 +33,8 @@ static int skip_atoi(const char **s) #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ -#define SPECIAL 32 /* 0x */ -#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ #define do_div(n,base) ({ \ int __res; \ @@ -45,12 +45,16 @@ __res; }) static char *number(char *str, long num, int base, int size, int precision, int type) { - char c, sign, tmp[66]; - const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + char tmp[66]; + char c, sign, locase; int i; - if (type & LARGE) - digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + /* locase = 0 or 0x20. 
ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters */ + locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) @@ -81,7 +85,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = digits[do_div(num, base)]; + tmp[i++] = (digits[do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; @@ -95,7 +99,7 @@ static char *number(char *str, long num, int base, int size, int precision, *str++ = '0'; else if (base == 16) { *str++ = '0'; - *str++ = digits[33]; + *str++ = ('X' | locase); } } if (!(type & LEFT)) @@ -244,9 +248,9 @@ int vsprintf(char *buf, const char *fmt, va_list args) base = 8; break; - case 'X': - flags |= LARGE; case 'x': + flags |= SMALL; + case 'X': base = 16; break; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 419993f58c6b..fd987b17bda7 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -26,6 +26,9 @@ #include /* for PAGE_SIZE */ #include +/* Works only for digits and letters, but small and fast */ +#define TOLOWER(x) ((x) | 0x20) + /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string @@ -41,17 +44,17 @@ unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base) if (*cp == '0') { base = 8; cp++; - if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { + if ((TOLOWER(*cp) == 'x') && isxdigit(cp[1])) { cp++; base = 16; } } } else if (base == 16) { - if (cp[0] == '0' && toupper(cp[1]) == 'X') + if (cp[0] == '0' && TOLOWER(cp[1]) == 'x') cp += 2; } while (isxdigit(*cp) && - (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { + (value = isdigit(*cp) ? 
*cp-'0' : TOLOWER(*cp)-'a'+10) < base) { result = result*base + value; cp++; } @@ -92,17 +95,17 @@ unsigned long long simple_strtoull(const char *cp,char **endp,unsigned int base) if (*cp == '0') { base = 8; cp++; - if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { + if ((TOLOWER(*cp) == 'x') && isxdigit(cp[1])) { cp++; base = 16; } } } else if (base == 16) { - if (cp[0] == '0' && toupper(cp[1]) == 'X') + if (cp[0] == '0' && TOLOWER(cp[1]) == 'x') cp += 2; } - while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : (islower(*cp) - ? toupper(*cp) : *cp)-'A'+10) < base) { + while (isxdigit(*cp) + && (value = isdigit(*cp) ? *cp-'0' : TOLOWER(*cp)-'a'+10) < base) { result = result*base + value; cp++; } @@ -360,24 +363,25 @@ static noinline char* put_dec(char *buf, unsigned long long num) #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ -#define SPECIAL 32 /* 0x */ -#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ static char *number(char *buf, char *end, unsigned long long num, int base, int size, int precision, int type) { - char sign,tmp[66]; - const char *digits; - /* we are called with base 8, 10 or 16, only, thus don't need "g..." */ - static const char small_digits[] = "0123456789abcdefx"; /* "ghijklmnopqrstuvwxyz"; */ - static const char large_digits[] = "0123456789ABCDEFX"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + char tmp[66]; + char sign; + char locase; int need_pfx = ((type & SPECIAL) && base != 10); int i; - digits = (type & LARGE) ? large_digits : small_digits; + /* locase = 0 or 0x20. 
ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters */ + locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; - if (base < 2 || base > 36) - return NULL; sign = 0; if (type & SIGN) { if ((signed long long) num < 0) { @@ -404,7 +408,7 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int tmp[i++] = '0'; /* Generic code, for any base: else do { - tmp[i++] = digits[do_div(num,base)]; + tmp[i++] = (digits[do_div(num,base)] | locase); } while (num != 0); */ else if (base != 10) { /* 8 or 16 */ @@ -412,7 +416,7 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int int shift = 3; if (base == 16) shift = 4; do { - tmp[i++] = digits[((unsigned char)num) & mask]; + tmp[i++] = (digits[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ @@ -444,7 +448,7 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int ++buf; if (base == 16) { if (buf < end) - *buf = digits[16]; /* for arbitrary base: digits[33]; */ + *buf = ('X' | locase); ++buf; } } @@ -644,6 +648,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) continue; case 'p': + flags |= SMALL; if (field_width == -1) { field_width = 2*sizeof(void *); flags |= ZEROPAD; @@ -680,9 +685,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) base = 8; break; - case 'X': - flags |= LARGE; case 'x': + flags |= SMALL; + case 'X': base = 16; break; From cf7700fe24301df2c8d3636cf40784651c098207 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 22/32] x86 PM: move 64-bit hibernation files to arch/x86/power Move arch/x86/kernel/suspend_64.c to arch/x86/power . Move arch/x86/kernel/suspend_asm_64.S to arch/x86/power as hibernate_asm_64.S . Update purpose and copyright information in arch/x86/power/suspend_64.c and arch/x86/power/hibernate_asm_64.S . 
Update the Makefiles in arch/x86, arch/x86/kernel and arch/x86/power to reflect the above changes. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Makefile | 4 +++- arch/x86/kernel/Makefile | 2 -- arch/x86/power/Makefile | 9 +++++++-- .../suspend_asm_64.S => power/hibernate_asm_64.S} | 9 +++++++-- arch/x86/{kernel => power}/suspend_64.c | 5 +++-- 5 files changed, 20 insertions(+), 9 deletions(-) rename arch/x86/{kernel/suspend_asm_64.S => power/hibernate_asm_64.S} (95%) rename arch/x86/{kernel => power}/suspend_64.c (98%) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 364865b1b08d..204af43535c5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -191,8 +191,10 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -ifeq ($(CONFIG_X86_32),y) +# suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ + +ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 21dc1a061bf1..76ec0f8f138a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,8 +84,6 @@ ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_PM) += suspend_64.o - obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index d764ec950065..8ce87fb4abb4 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,2 +1,7 @@ -obj-$(CONFIG_PM) += cpu.o -obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_PM) += suspend_64.o + obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o +else + obj-$(CONFIG_PM) += cpu.o + 
obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +endif diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/power/hibernate_asm_64.S similarity index 95% rename from arch/x86/kernel/suspend_asm_64.S rename to arch/x86/power/hibernate_asm_64.S index aeb9a4d7681e..1deb3244b99b 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -1,7 +1,12 @@ -/* Copyright 2004,2005 Pavel Machek , Andi Kleen , Rafael J. Wysocki +/* + * Hibernation support for x86-64 * * Distribute under GPLv2. * + * Copyright 2007 Rafael J. Wysocki + * Copyright 2005 Andi Kleen + * Copyright 2004 Pavel Machek + * * swsusp_arch_resume must not use any stack or any nonlocal variables while * copying pages: * @@ -9,7 +14,7 @@ * image could very well be data page in "new" image, and overwriting * your own stack under you is bad idea. */ - + .text #include #include diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/power/suspend_64.c similarity index 98% rename from arch/x86/kernel/suspend_64.c rename to arch/x86/power/suspend_64.c index 7ac7130022f1..d51dbf21d021 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/power/suspend_64.c @@ -1,8 +1,9 @@ /* - * Suspend support specific for i386. + * Suspend and hibernation support for x86-64 * * Distribute under GPLv2 * + * Copyright (c) 2007 Rafael J. Wysocki * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ @@ -63,7 +64,7 @@ static void __save_processor_state(struct saved_context *ctxt) mtrr_save_fixed_ranges(NULL); /* - * control registers + * control registers */ rdmsrl(MSR_EFER, ctxt->efer); ctxt->cr0 = read_cr0(); From c57591244a08bb441c83472f5c110151bb7c2cc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 23/32] x86 PM: rename 32-bit files in arch/x86/power Rename cpu.c, suspend.c and swsusp.S in arch/x86/power to cpu_32.c, hibernate_32.c and hibernate_asm_32.S, respectively, and update the purpose and copyright information in these files. Update the Makefile in arch/x86/power to reflect the above changes. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/Makefile | 4 ++-- arch/x86/power/{cpu.c => cpu_32.c} | 2 +- arch/x86/power/{suspend.c => hibernate_32.c} | 2 +- arch/x86/power/{swsusp.S => hibernate_asm_32.S} | 3 +-- 4 files changed, 5 insertions(+), 6 deletions(-) rename arch/x86/power/{cpu.c => cpu_32.c} (99%) rename arch/x86/power/{suspend.c => hibernate_32.c} (98%) rename arch/x86/power/{swsusp.S => hibernate_asm_32.S} (96%) diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 8ce87fb4abb4..2c95118e510a 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -2,6 +2,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PM) += suspend_64.o obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o else - obj-$(CONFIG_PM) += cpu.o - obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o + obj-$(CONFIG_PM) += cpu_32.o + obj-$(CONFIG_HIBERNATION) += hibernate_32.o hibernate_asm_32.o endif diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu_32.c similarity index 99% rename from arch/x86/power/cpu.c rename to arch/x86/power/cpu_32.c index efcf620d1439..7f9c6da04a4c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu_32.c @@ -40,7 +40,7 @@ static void __save_processor_state(struct saved_context *ctxt) savesegment(ss, ctxt->ss); /* - * control registers + * control registers */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); diff --git a/arch/x86/power/suspend.c b/arch/x86/power/hibernate_32.c similarity index 98% rename from arch/x86/power/suspend.c rename to arch/x86/power/hibernate_32.c index 
a0020b913f31..5080c377ef12 100644 --- a/arch/x86/power/suspend.c +++ b/arch/x86/power/hibernate_32.c @@ -1,5 +1,5 @@ /* - * Suspend support specific for i386 - temporary page tables + * Hibernation support specific for i386 - temporary page tables * * Distribute under GPLv2 * diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/hibernate_asm_32.S similarity index 96% rename from arch/x86/power/swsusp.S rename to arch/x86/power/hibernate_asm_32.S index 53662e05b393..b95aa6cfe3cb 100644 --- a/arch/x86/power/swsusp.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -1,7 +1,6 @@ .text -/* Originally gcc generated, modified by hand - * +/* * This may not use any stack, nor any variable that is not "NoSave": * * Its rewriting one kernel image with another. What is stack in "old" From ef8b03fabfbab0738dacbb6c0c38d5af91759ca1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 24/32] x86 PM: consolidate suspend and hibernation code Move the hibernation-specific code from arch/x86/power/suspend_64.c to a separate file (hibernate_64.c) and the CPU-handling code to cpu_64.c (in line with the corresponding 32-bit code). Simplify arch/x86/power/Makefile . Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/Makefile | 9 +- arch/x86/power/{suspend_64.c => cpu_64.c} | 155 -------------------- arch/x86/power/hibernate_64.c | 169 ++++++++++++++++++++++ 3 files changed, 171 insertions(+), 162 deletions(-) rename arch/x86/power/{suspend_64.c => cpu_64.c} (54%) create mode 100644 arch/x86/power/hibernate_64.c diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 2c95118e510a..9ff4d5b55ad1 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,2 @@ -ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_PM) += suspend_64.o - obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o -else - obj-$(CONFIG_PM) += cpu_32.o - obj-$(CONFIG_HIBERNATION) += hibernate_32.o hibernate_asm_32.o -endif +obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/suspend_64.c b/arch/x86/power/cpu_64.c similarity index 54% rename from arch/x86/power/suspend_64.c rename to arch/x86/power/cpu_64.c index d51dbf21d021..66bdfb591fd8 100644 --- a/arch/x86/power/suspend_64.c +++ b/arch/x86/power/cpu_64.c @@ -15,9 +15,6 @@ #include #include -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - static void fix_processor_context(void); struct saved_context saved_context; @@ -167,155 +164,3 @@ static void fix_processor_context(void) loaddebug(&current->thread, 7); } } - -#ifdef CONFIG_HIBERNATION -/* Defined in arch/x86_64/kernel/suspend_asm.S */ -extern int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header).
- */ -unsigned long restore_cr3; - -pgd_t *temp_level4_pgt; - -void *relocated_restore_code; - -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; -} - -static int set_up_temporary_mappings(void) -{ - unsigned long start, end, next; - int error; - - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) - return -ENOMEM; - - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - - /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); - } - return 0; -} - -int swsusp_arch_resume(void) -{ - int error; - - /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) - return error; - - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); - - 
restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -struct restore_data_record { - unsigned long jump_address; - unsigned long cr3; - unsigned long magic; -}; - -#define RESTORE_MAGIC 0x0123456789ABCDEFUL - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = restore_jump_address; - rdr->cr3 = restore_cr3; - rdr->magic = RESTORE_MAGIC; - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; -} -#endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c new file mode 100644 index 000000000000..05f28f0d684a --- /dev/null +++ b/arch/x86/power/hibernate_64.c @@ -0,0 +1,169 @@ +/* + * Hibernation support for x86-64 + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. 
Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include +#include +#include +#include +#include + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* Defined in arch/x86_64/kernel/suspend_asm.S */ +extern int restore_image(void); + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + +pgd_t *temp_level4_pgt; + +void *relocated_restore_code; + +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + 
next = start + PGDIR_SIZE; + if (next > end) + next = end; + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } + return 0; +} + +int swsusp_arch_resume(void) +{ + int error; + + /* We have got enough memory and from now on we cannot recover */ + if ((error = set_up_temporary_mappings())) + return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + + restore_image(); + return 0; +} + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +struct restore_data_record { + unsigned long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == 
RESTORE_MAGIC) ? 0 : -EINVAL; +} From 261f0ce5ccdd17dc240d8453ca5ffc4688b92700 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 25/32] x86 PM: update stale comments In some suspend and hibernation files in arch/x86/power there are comments referring to arch/x86-64 and arch/i386 . Update them to reflect the current code layout. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/hibernate_32.c | 4 ++-- arch/x86/power/hibernate_64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 5080c377ef12..f2b6e3f11bfc 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -13,7 +13,7 @@ #include #include -/* Defined in arch/i386/power/swsusp.S */ +/* Defined in hibernate_asm_32.S */ extern int restore_image(void); /* References to section boundaries */ @@ -23,7 +23,7 @@ extern const void __nosave_begin, __nosave_end; pgd_t *resume_pg_dir; /* The following three functions are based on the analogous code in - * arch/i386/mm/init.c + * arch/x86/mm/init_32.c */ /* diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 05f28f0d684a..b542355e0e34 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -18,7 +18,7 @@ /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -/* Defined in arch/x86_64/kernel/suspend_asm.S */ +/* Defined in hibernate_asm_64.S */ extern int restore_image(void); /* From 31f4b46ec6f889533c06537dea96bb0d20fa625b Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 26/32] lguest: accept guest _PAGE_PWT page table entries Beginning from commit 4138cc3418f5, ioremap_nocache() sets the _PAGE_PWT flag. Lguest doesn't accept a guest pte with a _PWT flag and reports a "bad page table entry" in that case. 
Accept guest _PAGE_PWT page table entries. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/lguest/page_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 74b4cf2a6c41..275f23c2deb4 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -178,8 +178,8 @@ static void release_pte(pte_t pte) static void check_gpte(struct lg_cpu *cpu, pte_t gpte) { - if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) - || pte_pfn(gpte) >= cpu->lg->pfn_limit) + if ((pte_flags(gpte) & _PAGE_PSE) || + pte_pfn(gpte) >= cpu->lg->pfn_limit) kill_guest(cpu, "bad page table entry"); } From 166124fde978b5a6c4412fb295c7f39711beb1b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 27/32] brk: help text typo fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 455170e1c1e3..824d48cb67bf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -587,7 +587,7 @@ config COMPAT_BRK disabled, and can be overriden runtime by setting /proc/sys/kernel/randomize_va_space to 2. - On non-ancient distros (post-2000 ones) Y is usually a safe choice. + On non-ancient distros (post-2000 ones) N is usually a safe choice. config BASE_FULL default y From a03c2a48e02aacaaea211c94691b729be357e047 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 28/32] x86: DEBUG_PAGEALLOC: enable after mem_init() DEBUG_PAGEALLOC must not be enabled before mem_init(). Before this point there is nothing to allocate. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index c59859b85db0..8b1982082ad8 100644 --- a/init/main.c +++ b/init/main.c @@ -558,7 +558,6 @@ asmlinkage void __init start_kernel(void) preempt_disable(); build_all_zonelists(); page_alloc_init(); - enable_debug_pagealloc(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, @@ -614,6 +613,7 @@ asmlinkage void __init start_kernel(void) vfs_caches_init_early(); cpuset_init_early(); mem_init(); + enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); setup_per_cpu_pageset(); From 76ebd0548df6ee48586e9b80d8fc2f58aa5fb51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 29/32] x86: introduce page pool in cpa DEBUG_PAGEALLOC was not possible on 64-bit due to its early-bootup hardcoded reliance on PSE pages, and the unrobustness of the runtime splitup of large pages. The splitup ended in recursive calls to alloc_pages() when a page for a pte split was requested. Avoid the recursion with a preallocated page pool, which is used to split up large mappings and gets refilled in the return path of kernel_map_pages after the split has been done. The size of the page pool is adjusted to the available memory. This part just implements the page pool and the initialization w/o using it yet. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 + arch/x86/mm/init_64.c | 2 + arch/x86/mm/pageattr.c | 82 +++++++++++++++++++++++++++++++++++- include/asm-x86/cacheflush.h | 2 + 4 files changed, 87 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 54aba3cf9efe..8106bba41ecb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -664,6 +664,8 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + cpa_init(); + /* * Subtle. SMP is doing it's boot stuff late (because it has to * fork idle threads) - but it also needs low mappings for the diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 620d2b6b6bf4..b59fc238151f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -528,6 +528,8 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); + + cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index eb2a54415a77..831462c3bc35 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -336,6 +337,77 @@ out_unlock: return do_split; } +static LIST_HEAD(page_pool); +static unsigned long pool_size, pool_pages, pool_low; +static unsigned long pool_used, pool_failed, pool_refill; + +static void cpa_fill_pool(void) +{ + struct page *p; + gfp_t gfp = GFP_KERNEL; + + /* Do not allocate from interrupt context */ + if (in_irq() || irqs_disabled()) + return; + /* + * Check unlocked. I does not matter when we have one more + * page in the pool. The bit lock avoids recursive pool + * allocations: + */ + if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + return; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * We could do: + * gfp = in_atomic() ? 
GFP_ATOMIC : GFP_KERNEL; + * but this fails on !PREEMPT kernels + */ + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; +#endif + + while (pool_pages < pool_size) { + p = alloc_pages(gfp, 0); + if (!p) { + pool_failed++; + break; + } + spin_lock_irq(&pgd_lock); + list_add(&p->lru, &page_pool); + pool_pages++; + spin_unlock_irq(&pgd_lock); + } + clear_bit_unlock(0, &pool_refill); +} + +#define SHIFT_MB (20 - PAGE_SHIFT) +#define ROUND_MB_GB ((1 << 10) - 1) +#define SHIFT_MB_GB 10 +#define POOL_PAGES_PER_GB 16 + +void __init cpa_init(void) +{ + struct sysinfo si; + unsigned long gb; + + si_meminfo(&si); + /* + * Calculate the number of pool pages: + * + * Convert totalram (nr of pages) to MiB and round to the next + * GiB. Shift MiB to Gib and multiply the result by + * POOL_PAGES_PER_GB: + */ + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + pool_low = pool_size; + + cpa_fill_pool(); + printk(KERN_DEBUG + "CPA: page pool initialized %lu of %lu pages preallocated\n", + pool_pages, pool_size); +} + static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; @@ -600,7 +672,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * Check whether we really changed something: */ if (!cpa.flushtlb) - return ret; + goto out; /* * No need to flush, when we did not set any of the caching @@ -619,6 +691,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, else cpa_flush_all(cache); +out: + cpa_fill_pool(); return ret; } @@ -772,6 +846,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + /* + * Try to refill the page pool here. We can do this only after + * the tlb flush. 
+ */ + cpa_fill_pool(); } #endif diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h index 8dd8c5e3cc7f..6a22212b4b20 100644 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@ -44,6 +44,8 @@ int set_memory_np(unsigned long addr, int numpages); void clflush_cache_range(void *addr, unsigned int size); +void cpa_init(void); + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); #endif From eb5b5f024c40f02e9b0f3801173769a726f170fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 30/32] x86: cpa, use page pool Switch the split page code to use the page pool. We do this unconditionally to avoid different behaviour with and without DEBUG_PAGEALLOC enabled. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 831462c3bc35..e5d29a112d00 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -411,20 +411,29 @@ void __init cpa_init(void) static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; - gfp_t gfp_flags = GFP_KERNEL; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; struct page *base; -#ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = GFP_ATOMIC | __GFP_NOWARN; -#endif - base = alloc_pages(gfp_flags, 0); - if (!base) - return -ENOMEM; - + /* + * Get a page from the pool. 
The pool list is protected by the + * pgd_lock, which we have to take anyway for the split + * operation: + */ spin_lock_irqsave(&pgd_lock, flags); + if (list_empty(&page_pool)) { + spin_unlock_irqrestore(&pgd_lock, flags); + return -ENOMEM; + } + + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; + /* * Check for races, another CPU might have split this page * up for us already: @@ -469,11 +478,17 @@ static int split_large_page(pte_t *kpte, unsigned long address) base = NULL; out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) { + list_add(&base->lru, &page_pool); + pool_pages++; + } else + pool_used++; spin_unlock_irqrestore(&pgd_lock, flags); - if (base) - __free_pages(base, 0); - return 0; } From b1d95f4e41d6a5969e3a847ceeae8379f30c84c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 31/32] x86: cpa, enable CONFIG_DEBUG_PAGEALLOC on 64-bit Now, that the page pool is in place we can enable DEBUG_PAGEALLOC on 64bit. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fa555148823d..864affc9a7b0 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -34,13 +34,9 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -comment "Page alloc debug is incompatible with Software Suspend on i386" - depends on DEBUG_KERNEL && HIBERNATION - depends on X86_32 - config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && X86_32 + depends on DEBUG_KERNEL help Unmap pages from the kernel linear mapping after free_pages(). 
This results in a large slowdown, but helps to find certain types From fac84939609a683503947f41eb93e1917d026263 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: [PATCH 32/32] x86: cpa, strict range check in try_preserve_large_page() Right now, we check only the first 4k page for static required protections. This does not take overlapping regions into account. So we might end up setting the wrong permissions/protections for other parts of this large page. This can be optimized further, but correctness is the important part. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e5d29a112d00..440210a2277d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -253,10 +253,10 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags; + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int do_split = 1; + int i, do_split = 1; unsigned int level; spin_lock_irqsave(&pgd_lock, flags); @@ -303,6 +303,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); new_prot = static_protections(new_prot, address); + /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + addr = address + PAGE_SIZE; + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) { + pgprot_t chk_prot = static_protections(new_prot, addr); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + /* * If there are no changes, return. maxpages has been updated * above: