2012-09-05 03:12:07 +08:00
|
|
|
/*
|
|
|
|
* You SHOULD NOT be including this unless you're vsyscall
|
|
|
|
* handling code or timekeeping internal code!
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_TIMEKEEPER_INTERNAL_H
|
|
|
|
#define _LINUX_TIMEKEEPER_INTERNAL_H
|
|
|
|
|
|
|
|
#include <linux/clocksource.h>
|
|
|
|
#include <linux/jiffies.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
|
2014-07-17 05:05:16 +08:00
|
|
|
/**
|
|
|
|
* struct tk_read_base - base structure for timekeeping readout
|
|
|
|
* @clock: Current clocksource used for timekeeping.
|
|
|
|
* @read: Read function of @clock
|
|
|
|
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
|
|
|
|
* @cycle_last: @clock cycle value at last update
|
2015-03-19 17:09:06 +08:00
|
|
|
* @mult: (NTP adjusted) multiplier for scaled math conversion
|
2014-07-17 05:05:16 +08:00
|
|
|
* @shift: Shift value for scaled math conversion
|
|
|
|
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
|
2015-03-19 17:09:06 +08:00
|
|
|
* @base: ktime_t (nanoseconds) base time for readout
|
2014-07-17 05:04:07 +08:00
|
|
|
*
|
2014-07-17 05:05:16 +08:00
|
|
|
* This struct has size 56 byte on 64 bit. Together with a seqcount it
|
|
|
|
* occupies a single 64byte cache line.
|
2014-07-17 05:04:07 +08:00
|
|
|
*
|
2014-07-17 05:05:16 +08:00
|
|
|
* The struct is separate from struct timekeeper as it is also used
|
2015-03-19 17:09:06 +08:00
|
|
|
* for a fast NMI safe accessors.
|
2014-07-17 05:04:07 +08:00
|
|
|
*/
|
2014-07-17 05:05:16 +08:00
|
|
|
struct tk_read_base {
|
2012-09-05 03:12:07 +08:00
|
|
|
struct clocksource *clock;
|
2014-07-17 05:05:15 +08:00
|
|
|
cycle_t (*read)(struct clocksource *cs);
|
|
|
|
cycle_t mask;
|
2014-07-17 05:05:13 +08:00
|
|
|
cycle_t cycle_last;
|
2012-09-05 03:12:07 +08:00
|
|
|
u32 mult;
|
|
|
|
u32 shift;
|
2014-07-17 05:04:07 +08:00
|
|
|
u64 xtime_nsec;
|
2015-03-19 17:09:06 +08:00
|
|
|
ktime_t base;
|
2014-07-17 05:05:16 +08:00
|
|
|
};
|
2014-07-17 05:04:10 +08:00
|
|
|
|
2014-07-17 05:05:16 +08:00
|
|
|
/**
|
|
|
|
* struct timekeeper - Structure holding internal timekeeping values.
|
2015-03-19 17:09:06 +08:00
|
|
|
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC
|
2015-03-19 16:28:44 +08:00
|
|
|
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
|
2014-07-17 05:05:16 +08:00
|
|
|
* @xtime_sec: Current CLOCK_REALTIME time in seconds
|
2014-10-29 18:31:16 +08:00
|
|
|
* @ktime_sec: Current CLOCK_MONOTONIC time in seconds
|
2014-07-17 05:05:16 +08:00
|
|
|
* @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
|
|
|
|
* @offs_real: Offset clock monotonic -> clock realtime
|
|
|
|
* @offs_boot: Offset clock monotonic -> clock boottime
|
|
|
|
* @offs_tai: Offset clock monotonic -> clock tai
|
|
|
|
* @tai_offset: The current UTC to TAI offset in seconds
|
2015-04-15 05:08:37 +08:00
|
|
|
* @clock_was_set_seq: The sequence number of clock was set events
|
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
|
|
|
* @cs_was_changed_seq: The sequence number of clocksource change events
|
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
|
|
|
* @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second
|
2014-07-17 05:05:16 +08:00
|
|
|
* @raw_time: Monotonic raw base time in timespec64 format
|
|
|
|
* @cycle_interval: Number of clock cycles in one NTP interval
|
|
|
|
* @xtime_interval: Number of clock shifted nano seconds in one NTP
|
|
|
|
* interval.
|
|
|
|
* @xtime_remainder: Shifted nano seconds left over when rounding
|
|
|
|
* @cycle_interval
|
|
|
|
* @raw_interval: Raw nano seconds accumulated per NTP interval.
|
|
|
|
* @ntp_error: Difference between accumulated time and NTP time in ntp
|
|
|
|
* shifted nano seconds.
|
|
|
|
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
|
|
|
|
* ntp shifted nano seconds.
|
2015-05-14 07:04:47 +08:00
|
|
|
* @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING)
|
|
|
|
* @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING)
|
|
|
|
* @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING)
|
2014-07-17 05:05:16 +08:00
|
|
|
*
|
|
|
|
* Note: For timespec(64) based interfaces wall_to_monotonic is what
|
|
|
|
* we need to add to xtime (or xtime corrected for sub jiffie times)
|
|
|
|
* to get to monotonic time. Monotonic is pegged at zero at system
|
|
|
|
* boot time, so wall_to_monotonic will be negative, however, we will
|
|
|
|
* ALWAYS keep the tv_nsec part positive so we can use the usual
|
|
|
|
* normalization.
|
|
|
|
*
|
|
|
|
* wall_to_monotonic is moved after resume from suspend for the
|
|
|
|
* monotonic time not to jump. We need to add total_sleep_time to
|
|
|
|
* wall_to_monotonic to get the real boot based time offset.
|
|
|
|
*
|
|
|
|
* wall_to_monotonic is no longer the boot time, getboottime must be
|
|
|
|
* used instead.
|
|
|
|
*/
|
|
|
|
struct timekeeper {
|
2015-03-19 17:09:06 +08:00
|
|
|
struct tk_read_base tkr_mono;
|
2015-03-19 16:28:44 +08:00
|
|
|
struct tk_read_base tkr_raw;
|
2014-07-17 05:04:07 +08:00
|
|
|
u64 xtime_sec;
|
2014-10-29 18:31:16 +08:00
|
|
|
unsigned long ktime_sec;
|
2014-07-17 05:04:07 +08:00
|
|
|
struct timespec64 wall_to_monotonic;
|
|
|
|
ktime_t offs_real;
|
|
|
|
ktime_t offs_boot;
|
|
|
|
ktime_t offs_tai;
|
|
|
|
s32 tai_offset;
|
2015-04-15 05:08:37 +08:00
|
|
|
unsigned int clock_was_set_seq;
|
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
|
|
|
u8 cs_was_changed_seq;
|
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
|
|
|
ktime_t next_leap_ktime;
|
2014-07-17 05:04:07 +08:00
|
|
|
struct timespec64 raw_time;
|
|
|
|
|
2014-07-17 05:05:16 +08:00
|
|
|
/* The following members are for timekeeping internal use */
|
2012-09-05 03:12:07 +08:00
|
|
|
cycle_t cycle_interval;
|
|
|
|
u64 xtime_interval;
|
|
|
|
s64 xtime_remainder;
|
|
|
|
u32 raw_interval;
|
2014-04-24 11:53:29 +08:00
|
|
|
/* The ntp_tick_length() value currently being used.
|
|
|
|
* This cached copy ensures we consistently apply the tick
|
|
|
|
* length for an entire tick, as ntp_tick_length may change
|
|
|
|
* mid-tick, and we don't want to apply that new value to
|
|
|
|
* the tick in progress.
|
|
|
|
*/
|
|
|
|
u64 ntp_tick;
|
|
|
|
/* Difference between accumulated time and NTP time in ntp
|
|
|
|
* shifted nano seconds. */
|
2012-09-05 03:12:07 +08:00
|
|
|
s64 ntp_error;
|
2014-07-17 05:04:07 +08:00
|
|
|
u32 ntp_error_shift;
|
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
|
|
|
u32 ntp_err_mult;
|
2015-05-14 07:04:47 +08:00
|
|
|
#ifdef CONFIG_DEBUG_TIMEKEEPING
|
|
|
|
long last_warning;
|
|
|
|
/*
|
|
|
|
* These simple flag variables are managed
|
|
|
|
* without locks, which is racy, but they are
|
|
|
|
* ok since we don't really care about being
|
|
|
|
* super precise about how many events were
|
|
|
|
* seen, just that a problem was observed.
|
|
|
|
*/
|
|
|
|
int underflow_seen;
|
|
|
|
int overflow_seen;
|
|
|
|
#endif
|
2012-09-05 03:12:07 +08:00
|
|
|
};
|
2012-09-05 03:27:48 +08:00
|
|
|
|
2012-09-12 07:58:13 +08:00
|
|
|
/*
 * vsyscall update hooks.
 *
 * With CONFIG_GENERIC_TIME_VSYSCALL the hooks take the timekeeper itself;
 * with CONFIG_GENERIC_TIME_VSYSCALL_OLD the individual values are passed
 * to update_vsyscall_old() instead. If neither option is set, the hooks
 * are empty inline stubs so callers need no #ifdefs of their own.
 */
#ifdef CONFIG_GENERIC_TIME_VSYSCALL

extern void update_vsyscall(struct timekeeper *tk);
extern void update_vsyscall_tz(void);

#elif defined(CONFIG_GENERIC_TIME_VSYSCALL_OLD)

extern void update_vsyscall_old(struct timespec *ts, struct timespec *wtm,
				struct clocksource *c, u32 mult,
				cycle_t cycle_last);
extern void update_vsyscall_tz(void);

#else

/* No vsyscall support configured: nothing to update. */
static inline void update_vsyscall(struct timekeeper *tk)
{
}

/* No vsyscall support configured: nothing to update. */
static inline void update_vsyscall_tz(void)
{
}
#endif
|
|
|
|
|
2012-09-05 03:12:07 +08:00
|
|
|
#endif /* _LINUX_TIMEKEEPER_INTERNAL_H */
|