Merge branch 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  Documentation: Add timers/timers-howto.txt
  timer: Added usleep_range timer
  Revert "timer: Added usleep[_range] timer"
  clockevents: Remove the per cpu tick skew
  posix_timer: Move copy_to_user(created_timer_id) down in timer_create()
  timer: Added usleep[_range] timer
  timers: Document meaning of deferrable timer
This commit is contained in:
Linus Torvalds 2010-08-06 13:12:36 -07:00
commit af39008435
5 changed files with 141 additions and 12 deletions

View File

@ -0,0 +1,105 @@
delays - Information on the various kernel delay / sleep mechanisms
-------------------------------------------------------------------
This document seeks to answer the common question: "What is the
RightWay (TM) to insert a delay?"
This question is most often faced by driver writers who have to
deal with hardware delays and who may not be the most intimately
familiar with the inner workings of the Linux Kernel.
Inserting Delays
----------------
The first, and most important, question you need to ask is "Is my
code in an atomic context?" This should be followed closely by "Does
it really need to delay in atomic context?" If so...
ATOMIC CONTEXT:
You must use the *delay family of functions. These
functions use the jiffy estimation of clock speed
and will busy wait for enough loop cycles to achieve
the desired delay:
ndelay(unsigned long nsecs)
udelay(unsigned long usecs)
mdelay(unsigned long msecs)
udelay is the generally preferred API; ndelay-level
precision may not actually exist on many non-PC devices.
mdelay is a macro wrapper around udelay, to account for
possible overflow when passing large arguments to udelay.
In general, use of mdelay is discouraged and code should
be refactored to allow for the use of msleep.
NON-ATOMIC CONTEXT:
You should use the *sleep[_range] family of functions.
There are a few more options here. While any of them may
work correctly, using the "right" sleep function will
help the scheduler, power management, and just make your
driver better :)
-- Backed by busy-wait loop:
udelay(unsigned long usecs)
-- Backed by hrtimers:
usleep_range(unsigned long min, unsigned long max)
-- Backed by jiffies / legacy_timers:
msleep(unsigned long msecs)
msleep_interruptible(unsigned long msecs)
Unlike the *delay family, the underlying mechanism
driving each of these calls varies, thus there are
quirks you should be aware of.
SLEEPING FOR "A FEW" USECS ( < ~10us? ):
* Use udelay
- Why not usleep?
On slower systems, (embedded, OR perhaps a speed-
stepped PC!) the overhead of setting up the hrtimers
for usleep *may* not be worth it. Such an evaluation
will obviously depend on your specific situation, but
it is something to be aware of.
SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms):
* Use usleep_range
- Why not msleep for (1ms - 20ms)?
Explained originally here:
http://lkml.org/lkml/2007/8/3/250
msleep(1~20) may not do what the caller intends, and
will often sleep longer (~20 ms actual sleep for any
value given in the 1~20ms range). In many cases this
is not the desired behavior.
- Why is there no "usleep" / What is a good range?
Since usleep_range is built on top of hrtimers, the
wakeup will be very precise (ish), thus a simple
usleep function would likely introduce a large number
of undesired interrupts.
With the introduction of a range, the scheduler is
free to coalesce your wakeup with any other wakeup
that may have happened for other reasons, or at the
worst case, fire an interrupt for your upper bound.
The larger a range you supply, the greater a chance
that you will not trigger an interrupt; this should
be balanced with what is an acceptable upper bound on
delay / performance for your specific code path. Exact
tolerances here are very situation specific, thus it
is left to the caller to determine a reasonable range.
SLEEPING FOR LARGER MSECS ( 10ms+ ):
* Use msleep or possibly msleep_interruptible
- What's the difference?
msleep sets the current task to TASK_UNINTERRUPTIBLE
whereas msleep_interruptible sets the current task to
TASK_INTERRUPTIBLE before scheduling the sleep. In
short, the difference is whether the sleep can be ended
early by a signal. In general, just use msleep unless
you know you have a need for the interruptible variant.

View File

@ -45,6 +45,7 @@ extern unsigned long lpj_fine;
void calibrate_delay(void); void calibrate_delay(void);
void msleep(unsigned int msecs); void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range(unsigned long min, unsigned long max);
static inline void ssleep(unsigned int seconds) static inline void ssleep(unsigned int seconds)
{ {

View File

@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->it_clock = which_clock; new_timer->it_clock = which_clock;
new_timer->it_overrun = -1; new_timer->it_overrun = -1;
if (copy_to_user(created_timer_id,
&new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
if (timer_event_spec) { if (timer_event_spec) {
if (copy_from_user(&event, timer_event_spec, sizeof (event))) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
error = -EFAULT; error = -EFAULT;
@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER; new_timer->sigq->info.si_code = SI_TIMER;
if (copy_to_user(created_timer_id,
&new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
if (error) if (error)
goto out; goto out;

View File

@ -774,7 +774,6 @@ void tick_setup_sched_timer(void)
{ {
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now = ktime_get(); ktime_t now = ktime_get();
u64 offset;
/* /*
* Emulate tick processing via per-CPU hrtimers: * Emulate tick processing via per-CPU hrtimers:
@ -784,10 +783,6 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */ /* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
offset = ktime_to_ns(tick_period) >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) { for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_forward(&ts->sched_timer, now, tick_period);

View File

@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* /*
* Note that all tvec_bases are 2 byte aligned and lower bit of * Note that all tvec_bases are 2 byte aligned and lower bit of
* base in timer_list is guaranteed to be zero. Use the LSB for * base in timer_list is guaranteed to be zero. Use the LSB to
* the new flag to indicate whether the timer is deferrable * indicate whether the timer is deferrable.
*
* A deferrable timer will work normally when the system is busy, but
* will not cause a CPU to come out of idle just to service it; instead,
* the timer will be serviced when the CPU eventually wakes up with a
* subsequent non-deferrable timer.
*/ */
#define TBASE_DEFERRABLE_FLAG (0x1) #define TBASE_DEFERRABLE_FLAG (0x1)
@ -1758,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
} }
EXPORT_SYMBOL(msleep_interruptible); EXPORT_SYMBOL(msleep_interruptible);
/*
 * do_usleep_range - arm a relative hrtimeout covering [min, max] usecs
 * @min: earliest acceptable wakeup, in microseconds
 * @max: latest acceptable wakeup, in microseconds
 *
 * Converts the microsecond window into a relative expiry plus a slack
 * value in nanoseconds and passes both to schedule_hrtimeout_range().
 * The caller is expected to have set the task state beforehand.
 *
 * Returns whatever schedule_hrtimeout_range() returns.
 */
static int __sched do_usleep_range(unsigned long min, unsigned long max)
{
	ktime_t kmin;
	unsigned long span;
	unsigned long delta;

	/*
	 * Split @min into whole seconds and leftover nanoseconds instead of
	 * computing min * NSEC_PER_USEC directly: that product is evaluated
	 * in unsigned long and wraps on 32-bit builds once @min exceeds
	 * roughly 4.29 seconds. The split also keeps the nanosecond argument
	 * below one second.
	 */
	kmin = ktime_set(min / USEC_PER_SEC,
			 (min % USEC_PER_SEC) * NSEC_PER_USEC);

	/* A reversed range (max < min) means "no slack", not a wrapped one. */
	span = (max > min) ? max - min : 0;

	/* Saturate rather than wrap when the slack does not fit in a long. */
	if (span > ULONG_MAX / NSEC_PER_USEC)
		delta = ULONG_MAX;
	else
		delta = span * NSEC_PER_USEC;

	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
/**
 * usleep_range - Sleep for a caller-supplied window of microseconds
 * @min: Shortest acceptable sleep, in usecs
 * @max: Longest acceptable sleep, in usecs
 *
 * Marks the current task TASK_UNINTERRUPTIBLE and then hands the
 * [@min, @max] window to do_usleep_range(). The helper's return value
 * is not propagated to the caller.
 */
void usleep_range(unsigned long min, unsigned long max)
{
	/* The task state must be set before the timeout is scheduled. */
	__set_current_state(TASK_UNINTERRUPTIBLE);

	(void)do_usleep_range(min, max);
}
EXPORT_SYMBOL(usleep_range);