2018-11-01 02:21:09 +08:00
// SPDX-License-Identifier: GPL-2.0
2007-05-08 15:27:59 +08:00
/*
2018-11-01 02:21:08 +08:00
 * Kernel timekeeping code and accessor functions. Based on code from
 * timer.c, moved in commit 8524070b7982.
2007-05-08 15:27:59 +08:00
*/
2012-09-05 03:12:07 +08:00
# include <linux/timekeeper_internal.h>
2007-05-08 15:27:59 +08:00
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
2017-02-09 01:51:31 +08:00
# include <linux/nmi.h>
2009-10-07 21:09:06 +08:00
# include <linux/sched.h>
2017-02-08 15:45:17 +08:00
# include <linux/sched/loadavg.h>
2018-07-20 04:55:34 +08:00
# include <linux/sched/clock.h>
2011-03-24 05:16:04 +08:00
# include <linux/syscore_ops.h>
2007-05-08 15:27:59 +08:00
# include <linux/clocksource.h>
# include <linux/jiffies.h>
# include <linux/time.h>
# include <linux/tick.h>
2009-08-14 21:47:30 +08:00
# include <linux/stop_machine.h>
2012-11-28 09:28:59 +08:00
# include <linux/pvclock_gtod.h>
2014-04-08 06:39:20 +08:00
# include <linux/compiler.h>
2019-04-10 17:14:19 +08:00
# include <linux/audit.h>
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:36 +08:00
# include "tick-internal.h"
2013-03-23 02:31:29 +08:00
# include "ntp_internal.h"
2013-05-22 13:32:14 +08:00
# include "timekeeping_internal.h"
2009-08-14 21:47:26 +08:00
2013-06-27 18:35:45 +08:00
/*
 * Single-bit flags; each occupies its own bit so they can be OR-ed together.
 * NOTE(review): the code consuming these flags is outside this section --
 * confirm their meaning there before changing the values.
 */
# define TK_CLEAR_NTP (1 << 0)
# define TK_MIRROR (1 << 1)
2013-06-27 18:35:46 +08:00
/* Bit 2 of the TK_* flag set; its consumer is not visible in this section. */
# define TK_CLOCK_WAS_SET (1 << 2)
2013-06-27 18:35:45 +08:00
2018-06-04 21:34:21 +08:00
/* Selects why the timekeeper is being advanced. */
enum timekeeping_adv_mode {
	TK_ADV_TICK,	/* Update timekeeper when a tick has passed */
	TK_ADV_FREQ,	/* Update timekeeper on a direct frequency change */
};
2014-07-17 05:04:07 +08:00
/*
 * Core timekeeping state: the timekeeper plus the sequence counter that
 * readers use to detect concurrent updates.  The object is cacheline
 * aligned so that the most important readout data fits into a single
 * 64 byte cache line.
 */
static struct {
	seqcount_t		seq;
	struct timekeeper	timekeeper;
} tk_core ____cacheline_aligned = {
	.seq = SEQCNT_ZERO(tk_core.seq),
};
2014-07-17 05:04:07 +08:00
2013-02-22 06:51:38 +08:00
/*
 * Raw spinlock for the timekeeper.  Per the tk_clock_read() comment in this
 * file, code holding this lock does not need the READ_ONCE() clock helper.
 */
static DEFINE_RAW_SPINLOCK ( timekeeper_lock ) ;
2013-02-22 06:51:40 +08:00
/*
 * Second timekeeper instance alongside tk_core.timekeeper.
 * NOTE(review): presumably the staging copy that updates are prepared in
 * before being mirrored into tk_core -- its users are outside this section;
 * confirm.
 */
static struct timekeeper shadow_timekeeper ;
2009-08-14 21:47:26 +08:00
2014-07-17 05:05:23 +08:00
/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:	Sequence counter for protecting updates. The lowest bit
 *		is the index for the tk_read_base array
 * @base:	tk_read_base array. Access is indexed by the lowest bit of
 *		@seq.
 *
 * See update_fast_timekeeper() below.
 */
struct tk_fast {
seqcount_t seq ;
struct tk_read_base base [ 2 ] ;
} ;
2017-08-28 20:21:53 +08:00
/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * dummy_clock_read - read() callback of the placeholder clocksource
 * @cs:	clocksource being read (unused)
 *
 * Returns whatever is currently stored in cycles_at_suspend, so the value
 * reported through this clocksource only changes when that variable does.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
	return cycles_at_suspend;
}
/*
 * Placeholder clocksource: only the read() callback is populated, and it
 * reports cycles_at_suspend via dummy_clock_read().
 */
static struct clocksource dummy_clock = {
. read = dummy_clock_read ,
} ;
/*
 * Fast timekeeper instance.  Both latch copies initially point at
 * dummy_clock so a reader always finds a valid clock pointer.
 * NOTE(review): presumably backs the fast monotonic accessor -- confirm
 * against the readers further down the file.
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
. base [ 0 ] = { . clock = & dummy_clock , } ,
. base [ 1 ] = { . clock = & dummy_clock , } ,
} ;
/*
 * Fast timekeeper instance.  Both latch copies initially point at
 * dummy_clock so a reader always finds a valid clock pointer.
 * NOTE(review): presumably backs the fast raw-monotonic accessor -- confirm
 * against the readers further down the file.
 */
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
. base [ 0 ] = { . clock = & dummy_clock , } ,
. base [ 1 ] = { . clock = & dummy_clock , } ,
} ;
2014-07-17 05:05:23 +08:00
2011-11-15 03:46:39 +08:00
/* Flag indicating whether timekeeping is currently suspended */
int __read_mostly timekeeping_suspended ;
2012-07-13 13:21:53 +08:00
static inline void tk_normalize_xtime ( struct timekeeper * tk )
{
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ) ) {
tk - > tkr_mono . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
tk - > xtime_sec + + ;
}
2017-05-23 08:20:20 +08:00
while ( tk - > tkr_raw . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ) ) {
tk - > tkr_raw . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ;
tk - > raw_sec + + ;
}
2012-07-13 13:21:53 +08:00
}
2018-07-13 20:06:42 +08:00
static inline struct timespec64 tk_xtime ( const struct timekeeper * tk )
2014-07-17 05:04:05 +08:00
{
struct timespec64 ts ;
ts . tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
ts . tv_nsec = ( long ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-07-17 05:04:05 +08:00
return ts ;
}
2014-07-17 05:04:01 +08:00
static void tk_set_xtime ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
}
2014-07-17 05:04:01 +08:00
static void tk_xtime_add ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec + = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-08-22 08:30:46 +08:00
tk_normalize_xtime ( tk ) ;
2012-07-13 13:21:53 +08:00
}
2011-11-15 03:46:39 +08:00
2014-07-17 05:04:01 +08:00
static void tk_set_wall_to_mono ( struct timekeeper * tk , struct timespec64 wtm )
2012-07-28 02:48:12 +08:00
{
2014-07-17 05:04:01 +08:00
struct timespec64 tmp ;
2012-07-28 02:48:12 +08:00
/*
* Verify consistency of : offset_real = - wall_to_monotonic
* before modifying anything
*/
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - tk - > wall_to_monotonic . tv_sec ,
2012-07-28 02:48:12 +08:00
- tk - > wall_to_monotonic . tv_nsec ) ;
2016-12-25 18:38:40 +08:00
WARN_ON_ONCE ( tk - > offs_real ! = timespec64_to_ktime ( tmp ) ) ;
2012-07-28 02:48:12 +08:00
tk - > wall_to_monotonic = wtm ;
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - wtm . tv_sec , - wtm . tv_nsec ) ;
tk - > offs_real = timespec64_to_ktime ( tmp ) ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tk - > tai_offset , 0 ) ) ;
2012-07-28 02:48:12 +08:00
}
2014-07-17 05:05:00 +08:00
/*
 * tk_update_sleep_time - account additional time in the boot offset
 * @tk:		timekeeper to modify
 * @delta:	amount added to offs_boot
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

	tk->offs_boot = boot_offset;

	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(boot_offset);
}
2017-06-09 07:44:20 +08:00
/*
* tk_clock_read - atomic clocksource read ( ) helper
*
* This helper is necessary to use in the read paths because , while the
* seqlock ensures we don ' t return a bad value while structures are updated ,
* it doesn ' t protect from potential crashes . There is the possibility that
* the tkr ' s clocksource may change between the read reference , and the
* clock reference passed to the read function . This can cause crashes if
* the wrong clocksource is passed to the wrong read function .
* This isn ' t necessary to use when holding the timekeeper_lock or doing
* a read of the fast - timekeeper tkrs ( which is protected by its own locking
* and update logic ) .
*/
2018-07-13 20:06:42 +08:00
static inline u64 tk_clock_read ( const struct tk_read_base * tkr )
2017-06-09 07:44:20 +08:00
{
struct clocksource * clock = READ_ONCE ( tkr - > clock ) ;
return clock - > read ( clock ) ;
}
2015-03-12 12:16:32 +08:00
# ifdef CONFIG_DEBUG_TIMEKEEPING
2015-03-12 12:16:35 +08:00
# define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
2016-12-22 03:32:01 +08:00
static void timekeeping_check_update ( struct timekeeper * tk , u64 offset )
2015-03-12 12:16:32 +08:00
{
2016-12-22 03:32:01 +08:00
u64 max_cycles = tk - > tkr_mono . clock - > max_cycles ;
2015-03-19 17:09:06 +08:00
const char * name = tk - > tkr_mono . clock - > name ;
2015-03-12 12:16:32 +08:00
if ( offset > max_cycles ) {
2015-03-12 12:16:33 +08:00
printk_deferred ( " WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger \n " ,
2015-03-12 12:16:32 +08:00
offset , name , max_cycles ) ;
2015-03-12 12:16:33 +08:00
printk_deferred ( " timekeeping: Your kernel is sick, but tries to cope by capping time updates \n " ) ;
2015-03-12 12:16:32 +08:00
} else {
if ( offset > ( max_cycles > > 1 ) ) {
2015-12-13 14:26:11 +08:00
printk_deferred ( " INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld) \n " ,
2015-03-12 12:16:32 +08:00
offset , name , max_cycles > > 1 ) ;
printk_deferred ( " timekeeping: Your kernel is still fine, but is feeling a bit nervous \n " ) ;
}
}
2015-03-12 12:16:35 +08:00
2015-05-14 07:04:47 +08:00
if ( tk - > underflow_seen ) {
if ( jiffies - tk - > last_warning > WARNING_FREQ ) {
2015-03-12 12:16:35 +08:00
printk_deferred ( " WARNING: Underflow in clocksource '%s' observed, time update ignored. \n " , name ) ;
printk_deferred ( " Please report this, consider using a different clocksource, if possible. \n " ) ;
printk_deferred ( " Your kernel is probably still fine. \n " ) ;
2015-05-14 07:04:47 +08:00
tk - > last_warning = jiffies ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
if ( tk - > overflow_seen ) {
if ( jiffies - tk - > last_warning > WARNING_FREQ ) {
2015-03-12 12:16:35 +08:00
printk_deferred ( " WARNING: Overflow in clocksource '%s' observed, time update capped. \n " , name ) ;
printk_deferred ( " Please report this, consider using a different clocksource, if possible. \n " ) ;
printk_deferred ( " Your kernel is probably still fine. \n " ) ;
2015-05-14 07:04:47 +08:00
tk - > last_warning = jiffies ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:32 +08:00
}
2015-03-12 12:16:33 +08:00
2018-07-13 20:06:42 +08:00
static inline u64 timekeeping_get_delta ( const struct tk_read_base * tkr )
2015-03-12 12:16:33 +08:00
{
2015-05-14 07:04:47 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 now , last , mask , max , delta ;
2015-03-12 12:16:35 +08:00
unsigned int seq ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
/*
* Since we ' re called holding a seqlock , the data may shift
* under us while we ' re doing the calculation . This can cause
* false positives , since we ' d note a problem but throw the
* results away . So nest another seqlock here to atomically
* grab the points we are checking with .
*/
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( tkr ) ;
2015-03-12 12:16:35 +08:00
last = tkr - > cycle_last ;
mask = tkr - > mask ;
max = tkr - > clock - > max_cycles ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
delta = clocksource_delta ( now , last , mask ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:34 +08:00
/*
* Try to catch underflows by checking if we are seeing small
* mask - relative negative values .
*/
2015-03-12 12:16:35 +08:00
if ( unlikely ( ( ~ delta & mask ) < ( mask > > 3 ) ) ) {
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 1 ;
2015-03-12 12:16:34 +08:00
delta = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:34 +08:00
2015-03-12 12:16:33 +08:00
/* Cap delta value to the max_cycles values to avoid mult overflows */
2015-03-12 12:16:35 +08:00
if ( unlikely ( delta > max ) ) {
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 1 ;
2015-03-12 12:16:33 +08:00
delta = tkr - > clock - > max_cycles ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:33 +08:00
return delta ;
}
2015-03-12 12:16:32 +08:00
# else
2016-12-22 03:32:01 +08:00
/* Without CONFIG_DEBUG_TIMEKEEPING the update check is a no-op. */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
2018-07-13 20:06:42 +08:00
static inline u64 timekeeping_get_delta ( const struct tk_read_base * tkr )
2015-03-12 12:16:33 +08:00
{
2016-12-22 03:32:01 +08:00
u64 cycle_now , delta ;
2015-03-12 12:16:33 +08:00
/* read clocksource */
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( tkr ) ;
2015-03-12 12:16:33 +08:00
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta ( cycle_now , tkr - > cycle_last , tkr - > mask ) ;
return delta ;
}
2015-03-12 12:16:32 +08:00
# endif
2009-08-14 21:47:26 +08:00
/**
2013-11-28 16:28:55 +08:00
* tk_setup_internals - Set up internals to use clocksource clock .
2009-08-14 21:47:26 +08:00
*
2013-11-28 16:28:55 +08:00
* @ tk : The target timekeeper to setup .
2009-08-14 21:47:26 +08:00
* @ clock : Pointer to clocksource .
*
* Calculates a fixed cycle / nsec interval for a given clocksource / adjustment
* pair and interval request .
*
* Unless you ' re the timekeeping code , you should not be using this !
*/
2012-07-13 13:21:57 +08:00
static void tk_setup_internals ( struct timekeeper * tk , struct clocksource * clock )
2009-08-14 21:47:26 +08:00
{
2016-12-22 03:32:01 +08:00
u64 interval ;
2010-10-21 06:55:15 +08:00
u64 tmp , ntpinterval ;
2012-07-13 13:21:53 +08:00
struct clocksource * old_clock ;
2009-08-14 21:47:26 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
+ + tk - > cs_was_changed_seq ;
2015-03-19 17:09:06 +08:00
old_clock = tk - > tkr_mono . clock ;
tk - > tkr_mono . clock = clock ;
tk - > tkr_mono . mask = clock - > mask ;
2017-06-09 07:44:20 +08:00
tk - > tkr_mono . cycle_last = tk_clock_read ( & tk - > tkr_mono ) ;
2009-08-14 21:47:26 +08:00
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . clock = clock ;
tk - > tkr_raw . mask = clock - > mask ;
tk - > tkr_raw . cycle_last = tk - > tkr_mono . cycle_last ;
2009-08-14 21:47:26 +08:00
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH ;
tmp < < = clock - > shift ;
2010-10-21 06:55:15 +08:00
ntpinterval = tmp ;
2009-08-14 21:47:28 +08:00
tmp + = clock - > mult / 2 ;
do_div ( tmp , clock - > mult ) ;
2009-08-14 21:47:26 +08:00
if ( tmp = = 0 )
tmp = 1 ;
2016-12-22 03:32:01 +08:00
interval = ( u64 ) tmp ;
2012-07-13 13:21:57 +08:00
tk - > cycle_interval = interval ;
2009-08-14 21:47:26 +08:00
/* Go back from cycles -> shifted ns */
2016-12-09 04:49:36 +08:00
tk - > xtime_interval = interval * clock - > mult ;
2012-07-13 13:21:57 +08:00
tk - > xtime_remainder = ntpinterval - tk - > xtime_interval ;
2017-06-09 07:44:21 +08:00
tk - > raw_interval = interval * clock - > mult ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:53 +08:00
/* if changing clocks, convert xtime_nsec shift units */
if ( old_clock ) {
int shift_change = clock - > shift - old_clock - > shift ;
2017-05-23 08:20:20 +08:00
if ( shift_change < 0 ) {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec > > = - shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec > > = - shift_change ;
} else {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec < < = shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec < < = shift_change ;
}
2012-07-13 13:21:53 +08:00
}
2015-03-19 16:28:44 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . shift = clock - > shift ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . shift = clock - > shift ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
tk - > ntp_error_shift = NTP_SCALE_SHIFT - clock - > shift ;
2014-04-24 11:53:29 +08:00
tk - > ntp_tick = ntpinterval < < tk - > ntp_error_shift ;
2009-08-14 21:47:28 +08:00
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource . These value will be adjusted via NTP
* to counteract clock drifting .
*/
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult = clock - > mult ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . mult = clock - > mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
tk - > ntp_err_mult = 0 ;
2018-03-10 02:42:48 +08:00
tk - > skip_second_overflow = 0 ;
2009-08-14 21:47:26 +08:00
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:29 +08:00
/* Timekeeper helper functions. */
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
2014-07-17 05:03:50 +08:00
/* Default hook target: contributes no extra offset. */
static u32 default_arch_gettimeoffset(void)
{
	return 0;
}

/*
 * Function pointer the architecture may point at its own implementation;
 * until then it resolves to default_arch_gettimeoffset().
 */
u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# else
2014-07-17 05:03:50 +08:00
/* Without CONFIG_ARCH_USES_GETTIMEOFFSET there is no arch offset to add. */
static inline u32 arch_gettimeoffset(void)
{
	return 0;
}
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# endif
2018-07-13 20:06:42 +08:00
/*
 * timekeeping_delta_to_ns - convert a cycle delta to nanoseconds
 * @tkr:	readout base supplying mult, shift and the xtime_nsec remainder
 * @delta:	cycle delta to convert
 *
 * Computes (delta * mult + xtime_nsec) >> shift and adds the arch time
 * offset.  The arithmetic is deliberately unsigned: with signed math a
 * multiplication overflow turns the shifted result negative, which makes
 * time jump backwards and can stall callers that iteratively split the
 * value into seconds and nanoseconds.
 */
static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
{
	u64 shifted_ns = delta * tkr->mult + tkr->xtime_nsec;

	/* If arch requires, add in arch_gettimeoffset() */
	return (shifted_ns >> tkr->shift) + arch_gettimeoffset();
}
2018-07-13 20:06:42 +08:00
/*
 * timekeeping_get_ns - nanoseconds for the cycles elapsed on @tkr right now
 * @tkr:	readout base to sample and convert with
 */
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
	return timekeeping_delta_to_ns(tkr, timekeeping_get_delta(tkr));
}
2009-08-14 21:47:29 +08:00
2018-07-13 20:06:42 +08:00
/*
 * Convert the counter value @cycles into nanoseconds relative to @tkr.
 * The distance from tkr->cycle_last is computed under tkr->mask and then
 * handed to timekeeping_delta_to_ns().
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	/* Cycles elapsed since the last update_wall_time */
	const u64 elapsed = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);

	return timekeeping_delta_to_ns(tkr, elapsed);
}
2014-07-17 05:05:23 +08:00
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper .
2015-02-11 12:01:52 +08:00
* @ tkr : Timekeeping readout base from which we take the update
2014-07-17 05:05:23 +08:00
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself .
*
2015-05-27 09:39:36 +08:00
* Employ the latch technique ; see @ raw_write_seqcount_latch .
2014-07-17 05:05:23 +08:00
*
* So if a NMI hits the update of base [ 0 ] then it will use base [ 1 ]
* which is still consistent . In the worst case this can result is a
* slightly wrong timestamp ( a few nanoseconds ) . See
* @ ktime_get_mono_fast_ns .
*/
2018-07-13 20:06:42 +08:00
static void update_fast_timekeeper ( const struct tk_read_base * tkr ,
struct tk_fast * tkf )
2014-07-17 05:05:23 +08:00
{
2015-03-19 16:36:19 +08:00
struct tk_read_base * base = tkf - > base ;
2014-07-17 05:05:23 +08:00
/* Force readers off to base[1] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[0] */
2015-02-11 12:01:52 +08:00
memcpy ( base , tkr , sizeof ( * base ) ) ;
2014-07-17 05:05:23 +08:00
/* Force readers back to base[0] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[1] */
memcpy ( base + 1 , base , sizeof ( * base ) ) ;
}
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
* This timestamp is not guaranteed to be monotonic across an update .
* The timestamp is calculated by :
*
* now = base_mono + clock_delta * slope
*
* So if the update lowers the slope , readers who are forced to the
* not yet updated second array are still using the old steeper slope .
*
* tmono
* ^
* | o n
* | o n
* | u
* | o
* | o
* | 12345678 - - - > reader order
*
* o = old slope
* u = update
* n = new slope
*
* So reader 6 will observe time going backwards versus reader 5.
*
* While other CPUs are likely to be able observe that , the only way
* for a CPU local observation is when an NMI hits in the middle of
* the update . Timestamps taken from that NMI context might be ahead
* of the following timestamps . Callers need to be aware of that and
* deal with it .
*/
2015-03-19 16:36:19 +08:00
static __always_inline u64 __ktime_get_fast_ns ( struct tk_fast * tkf )
2014-07-17 05:05:23 +08:00
{
struct tk_read_base * tkr ;
unsigned int seq ;
u64 now ;
do {
2015-05-27 09:39:36 +08:00
seq = raw_read_seqcount_latch ( & tkf - > seq ) ;
2015-03-19 16:36:19 +08:00
tkr = tkf - > base + ( seq & 0x01 ) ;
2016-08-24 07:08:21 +08:00
now = ktime_to_ns ( tkr - > base ) ;
2016-10-05 10:55:48 +08:00
now + = timekeeping_delta_to_ns ( tkr ,
clocksource_delta (
2017-06-09 07:44:20 +08:00
tk_clock_read ( tkr ) ,
2016-10-05 10:55:48 +08:00
tkr - > cycle_last ,
tkr - > mask ) ) ;
2015-03-19 16:36:19 +08:00
} while ( read_seqcount_retry ( & tkf - > seq , seq ) ) ;
2014-07-17 05:05:23 +08:00
return now ;
}
2015-03-19 16:36:19 +08:00
/* Fast readout of the tk_fast_mono timekeeper via __ktime_get_fast_ns(). */
u64 ktime_get_mono_fast_ns(void)
{
	struct tk_fast *fast = &tk_fast_mono;

	return __ktime_get_fast_ns(fast);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
2015-03-19 16:39:08 +08:00
u64 ktime_get_raw_fast_ns ( void )
{
return __ktime_get_fast_ns ( & tk_fast_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw_fast_ns ) ;
2018-04-25 21:33:38 +08:00
/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqlocks. This has the following minor side effects:
 *
 * (1) It is possible that a timestamp is taken after the boot offset is
 *     updated but before the timekeeper is updated. If this happens, the
 *     new boot offset is added to the old timekeeping making the clock
 *     appear to update slightly earlier:
 *
 *	CPU 0                                        CPU 1
 *	timekeeping_inject_sleeptime64()
 *	__timekeeping_inject_sleeptime(tk, delta);
 *	                                             timestamp();
 *	timekeeping_update(tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 *     partially updated. Since the tk->offs_boot update is a rare event,
 *     this should be a rare occurrence which postprocessing should be able
 *     to handle.
 *
 * Return: ktime_get_mono_fast_ns() plus tk->offs_boot, in nanoseconds.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	u64 mono_ns = ktime_get_mono_fast_ns();

	return mono_ns + ktime_to_ns(tk->offs_boot);
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
2017-08-31 23:12:48 +08:00
/*
 * Same readout scheme as __ktime_get_fast_ns(), but starting from the
 * base_real member of the selected readout base instead of base.
 *
 * See comment for __ktime_get_fast_ns() vs. timestamp ordering
 */
static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *rb;
	unsigned int latch;
	u64 cycles;
	u64 ns;

	do {
		latch = raw_read_seqcount_latch(&tkf->seq);
		/* The low bit of the latch sequence picks the copy to read */
		rb = tkf->base + (latch & 0x01);
		ns = ktime_to_ns(rb->base_real);

		cycles = clocksource_delta(tk_clock_read(rb),
					   rb->cycle_last,
					   rb->mask);
		ns += timekeeping_delta_to_ns(rb, cycles);
	} while (read_seqcount_retry(&tkf->seq, latch));

	return ns;
}
/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * Reads tk_fast_mono through __ktime_get_real_fast_ns(), which uses the
 * base_real member of that readout base.
 */
u64 ktime_get_real_fast_ns(void)
{
	struct tk_fast *fast = &tk_fast_mono;

	return __ktime_get_real_fast_ns(fast);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
2017-08-31 23:12:48 +08:00
2015-02-13 21:49:02 +08:00
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource .
* @ tk : Timekeeper to snapshot .
*
* It generally is unsafe to access the clocksource after timekeeping has been
* suspended , so take a snapshot of the readout base of @ tk and use it as the
* fast timekeeper ' s readout base while suspended . It will return the same
* number of cycles every time until timekeeping is resumed at which time the
* proper readout base for the fast timekeeper will be restored automatically .
*/
2018-07-13 20:06:42 +08:00
static void halt_fast_timekeeper ( const struct timekeeper * tk )
2015-02-13 21:49:02 +08:00
{
static struct tk_read_base tkr_dummy ;
2018-07-13 20:06:42 +08:00
const struct tk_read_base * tkr = & tk - > tkr_mono ;
2015-02-13 21:49:02 +08:00
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
cycles_at_suspend = tk_clock_read ( tkr ) ;
tkr_dummy . clock = & dummy_clock ;
2017-08-31 23:12:48 +08:00
tkr_dummy . base_real = tkr - > base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
tkr = & tk - > tkr_raw ;
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
tkr_dummy . clock = & dummy_clock ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_raw ) ;
2015-02-13 21:49:02 +08:00
}
2012-11-28 09:28:59 +08:00
static RAW_NOTIFIER_HEAD ( pvclock_gtod_chain ) ;
2013-06-27 18:35:46 +08:00
static void update_pvclock_gtod ( struct timekeeper * tk , bool was_set )
2012-11-28 09:28:59 +08:00
{
2013-06-27 18:35:46 +08:00
raw_notifier_call_chain ( & pvclock_gtod_chain , was_set , tk ) ;
2012-11-28 09:28:59 +08:00
}
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
*/
int pvclock_gtod_register_notifier ( struct notifier_block * nb )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-11-28 09:28:59 +08:00
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_register ( & pvclock_gtod_chain , nb ) ;
2013-06-27 18:35:46 +08:00
update_pvclock_gtod ( tk , true ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_register_notifier ) ;
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
*/
int pvclock_gtod_unregister_notifier ( struct notifier_block * nb )
{
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_unregister ( & pvclock_gtod_chain , nb ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_unregister_notifier ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this has since been avoided because it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible with leapsecond adjustments made by ntpd without using
the kernel discipline, where, due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 * @tk: timekeeper whose next_leap_ktime member is refreshed
 *
 * Stores the value returned by ntp_get_next_leap() in tk->next_leap_ktime.
 * Unless that value is KTIME_MAX, tk->offs_real is subtracted from it so
 * that the stored value is expressed in monotonic time.
 */
static inline void tk_update_leap_state ( struct timekeeper * tk )
{
tk - > next_leap_ktime = ntp_get_next_leap ( ) ;
2016-12-25 18:38:40 +08:00
if ( tk - > next_leap_ktime ! = KTIME_MAX )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Convert to monotonic time by removing the realtime offset (offs_real) */
tk - > next_leap_ktime = ktime_sub ( tk - > next_leap_ktime , tk - > offs_real ) ;
}
2014-07-17 05:04:10 +08:00
/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data ( struct timekeeper * tk )
{
2014-10-29 18:31:16 +08:00
u64 seconds ;
u32 nsec ;
2014-07-17 05:04:10 +08:00
/*
* The xtime based monotonic readout is :
* nsec = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec + now ( ) ;
* The ktime based monotonic readout is :
* nsec = base_mono + now ( ) ;
* = = > base_mono = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec
*/
2014-10-29 18:31:16 +08:00
seconds = ( u64 ) ( tk - > xtime_sec + tk - > wall_to_monotonic . tv_sec ) ;
nsec = ( u32 ) tk - > wall_to_monotonic . tv_nsec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . base = ns_to_ktime ( seconds * NSEC_PER_SEC + nsec ) ;
2014-07-17 05:05:04 +08:00
2014-10-29 18:31:16 +08:00
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater / equal one second . Take
* this into account before updating tk - > ktime_sec .
*/
2015-03-19 17:09:06 +08:00
nsec + = ( u32 ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-10-29 18:31:16 +08:00
if ( nsec > = NSEC_PER_SEC )
seconds + + ;
tk - > ktime_sec = seconds ;
2017-05-23 08:20:20 +08:00
/* Update the monotonic raw base */
2017-08-26 06:57:04 +08:00
tk - > tkr_raw . base = ns_to_ktime ( tk - > raw_sec * NSEC_PER_SEC ) ;
2014-07-17 05:04:10 +08:00
}
2013-02-22 06:51:38 +08:00
/*
 * timekeeping_update - propagate a timekeeper change to the derived state
 * @tk:     timekeeper that was modified
 * @action: bitmask of TK_CLEAR_NTP, TK_MIRROR and TK_CLOCK_WAS_SET
 *
 * must hold timekeeper_lock
 */
2013-06-27 18:35:45 +08:00
static void timekeeping_update ( struct timekeeper * tk , unsigned int action )
2011-11-14 07:19:49 +08:00
{
2013-06-27 18:35:45 +08:00
/* TK_CLEAR_NTP: zero the accumulated ntp_error and call ntp_clear() */
if ( action & TK_CLEAR_NTP ) {
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
2011-11-14 07:19:49 +08:00
ntp_clear ( ) ;
}
2013-02-22 06:51:40 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Refresh next_leap_ktime and the ktime_t based members of the timekeeper */
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:10 +08:00
tk_update_ktime_data ( tk ) ;
2014-09-06 18:24:49 +08:00
/* Hand the new state to update_vsyscall() and the pvclock gtod notifier chain */
update_vsyscall ( tk ) ;
update_pvclock_gtod ( tk , action & TK_CLOCK_WAS_SET ) ;
2017-08-31 23:12:48 +08:00
/* Set base_real before tkr_mono is copied into the fast timekeeper below */
tk - > tkr_mono . base_real = tk - > tkr_mono . base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tk - > tkr_mono , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tk - > tkr_raw , & tk_fast_raw ) ;
2015-04-15 05:08:37 +08:00
/* TK_CLOCK_WAS_SET: bump the counter that records clock-set events */
if ( action & TK_CLOCK_WAS_SET )
tk - > clock_was_set_seq + + ;
2015-06-12 06:54:53 +08:00
/*
 * The mirroring of the data to the shadow-timekeeper needs
 * to happen last here to ensure we do not overwrite the
 * timekeeper structure on the next update with stale data
 */
if ( action & TK_MIRROR )
memcpy ( & shadow_timekeeper , & tk_core . timekeeper ,
sizeof ( tk_core . timekeeper ) ) ;
2011-11-14 07:19:49 +08:00
}
2007-05-08 15:27:59 +08:00
/**
 * timekeeping_forward_now - update clock to the current time
 * @tk: timekeeper to forward
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 *
 * The clocksource is read once via tk_clock_read() on tkr_mono. That value
 * becomes cycle_last of both tkr_mono and tkr_raw, and the elapsed cycle
 * delta, scaled by the respective mult, is added to each xtime_nsec. The
 * value of arch_gettimeoffset() (referred to as get_arch_timeoffset() in
 * the comments inside the body), shifted left by the respective shift, is
 * added as well before tk_normalize_xtime() is called.
 */
2012-07-13 13:21:57 +08:00
static void timekeeping_forward_now ( struct timekeeper * tk )
2007-05-08 15:27:59 +08:00
{
2016-12-22 03:32:01 +08:00
u64 cycle_now , delta ;
2007-05-08 15:27:59 +08:00
2017-06-09 07:44:20 +08:00
/* One counter read, used for both the monotonic and the raw base */
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2015-03-19 17:09:06 +08:00
delta = clocksource_delta ( cycle_now , tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
/* xtime_nsec accumulates delta * mult, i.e. not yet shifted down */
tk - > tkr_mono . xtime_nsec + = delta * tk - > tkr_mono . mult ;
2009-05-02 04:10:26 +08:00
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independant mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
/* If arch requires, add in get_arch_timeoffset() */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) arch_gettimeoffset ( ) < < tk - > tkr_mono . shift ;
2009-05-02 04:10:26 +08:00
2008-08-21 07:37:30 +08:00
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec + = delta * tk - > tkr_raw . mult ;
/* If arch requires, add in get_arch_timeoffset() */
tk - > tkr_raw . xtime_nsec + = ( u64 ) arch_gettimeoffset ( ) < < tk - > tkr_raw . shift ;
tk_normalize_xtime ( tk ) ;
2007-05-08 15:27:59 +08:00
}
/**
2018-04-27 21:40:13 +08:00
* ktime_get_real_ts64 - Returns the time of day in a timespec64 .
2007-05-08 15:27:59 +08:00
* @ ts : pointer to the timespec to be set
*
2018-04-27 21:40:13 +08:00
* Returns the time of day in a timespec64 ( WARN if suspended ) .
2007-05-08 15:27:59 +08:00
*/
2018-04-27 21:40:13 +08:00
void ktime_get_real_ts64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2007-05-08 15:27:59 +08:00
2018-04-27 21:40:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
2007-05-08 15:27:59 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
2012-09-12 07:26:03 +08:00
ts - > tv_nsec = 0 ;
2014-07-17 05:04:04 +08:00
timespec64_add_ns ( ts , nsecs ) ;
2007-05-08 15:27:59 +08:00
}
2018-04-27 21:40:13 +08:00
EXPORT_SYMBOL ( ktime_get_real_ts64 ) ;
2007-05-08 15:27:59 +08:00
2009-07-07 17:27:28 +08:00
ktime_t ktime_get ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2014-07-17 05:04:12 +08:00
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2014-07-17 05:03:53 +08:00
2014-07-17 05:04:12 +08:00
return ktime_add_ns ( base , nsecs ) ;
2009-07-07 17:27:28 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get ) ;
2015-04-07 19:12:35 +08:00
u32 ktime_get_resolution_ns ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
u32 nsecs ;
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
nsecs = tk - > tkr_mono . mult > > tk - > tkr_mono . shift ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return nsecs ;
}
EXPORT_SYMBOL_GPL ( ktime_get_resolution_ns ) ;
2014-07-17 05:04:13 +08:00
static ktime_t * offsets [ TK_OFFS_MAX ] = {
[ TK_OFFS_REAL ] = & tk_core . timekeeper . offs_real ,
2018-04-25 21:33:38 +08:00
[ TK_OFFS_BOOT ] = & tk_core . timekeeper . offs_boot ,
2014-07-17 05:04:13 +08:00
[ TK_OFFS_TAI ] = & tk_core . timekeeper . offs_tai ,
} ;
ktime_t ktime_get_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:04:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2014-07-17 05:04:13 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_with_offset ) ;
2018-04-27 21:40:15 +08:00
ktime_t ktime_get_coarse_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2019-06-14 03:40:45 +08:00
u64 nsecs ;
2018-04-27 21:40:15 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
2019-06-14 03:40:45 +08:00
nsecs = tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ;
2018-04-27 21:40:15 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2019-06-22 04:32:47 +08:00
return ktime_add_ns ( base , nsecs ) ;
2018-04-27 21:40:15 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get_coarse_with_offset ) ;
2014-07-17 05:04:22 +08:00
/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:	time to convert.
 * @offs:	which offset to use
 *
 * Return: @tmono plus the offset selected by @offs, with the offset read
 * under the timekeeper sequence counter.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
	ktime_t *selected = offsets[offs];
	unsigned int seqc;
	ktime_t converted;

	do {
		seqc = read_seqcount_begin(&tk_core.seq);
		converted = ktime_add(tmono, *selected);
	} while (read_seqcount_retry(&tk_core.seq, seqc));

	return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
2014-07-17 05:05:04 +08:00
/**
* ktime_get_raw - Returns the raw monotonic time in ktime_t format
*/
ktime_t ktime_get_raw ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:05:04 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 16:28:44 +08:00
base = tk - > tkr_raw . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2014-07-17 05:05:04 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw ) ;
2009-07-07 17:27:28 +08:00
/**
2014-07-17 05:04:04 +08:00
* ktime_get_ts64 - get the monotonic clock in timespec64 format
2009-07-07 17:27:28 +08:00
* @ ts : pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
2014-11-08 05:13:04 +08:00
* in normalized timespec64 format in the variable pointed to by @ ts .
2009-07-07 17:27:28 +08:00
*/
2014-07-17 05:04:04 +08:00
void ktime_get_ts64 ( struct timespec64 * ts )
2009-07-07 17:27:28 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:04 +08:00
struct timespec64 tomono ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsec ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2014-07-17 05:04:04 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsec = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2012-07-28 02:48:13 +08:00
tomono = tk - > wall_to_monotonic ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:04 +08:00
ts - > tv_sec + = tomono . tv_sec ;
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsec + tomono . tv_nsec ) ;
2009-07-07 17:27:28 +08:00
}
2014-07-17 05:04:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_ts64 ) ;
2009-07-07 17:27:28 +08:00
2014-10-29 18:31:16 +08:00
/**
* ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
*
* Returns the seconds portion of CLOCK_MONOTONIC with a single non
* serialized read . tk - > ktime_sec is of type ' unsigned long ' so this
* works on both 32 and 64 bit systems . On 32 bit systems the readout
* covers ~ 136 years of uptime which should be enough to prevent
* premature wrap arounds .
*/
time64_t ktime_get_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
WARN_ON ( timekeeping_suspended ) ;
return tk - > ktime_sec ;
}
EXPORT_SYMBOL_GPL ( ktime_get_seconds ) ;
2014-10-29 18:31:50 +08:00
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
* Returns the wall clock seconds since 1970. This replaces the
* get_seconds ( ) interface which is not y2038 safe on 32 bit systems .
*
* For 64 bit systems the fast access to tk - > xtime_sec is preserved . On
* 32 bit systems the access must be protected with the sequence
* counter to provide " atomic " access to the 64 bit tk - > xtime_sec
* value .
*/
time64_t ktime_get_real_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
time64_t seconds ;
unsigned int seq ;
if ( IS_ENABLED ( CONFIG_64BIT ) )
return tk - > xtime_sec ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
seconds = tk - > xtime_sec ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return seconds ;
}
EXPORT_SYMBOL_GPL ( ktime_get_real_seconds ) ;
2015-12-13 12:24:18 +08:00
/**
 * __ktime_get_real_seconds - The same as ktime_get_real_seconds
 * but without the sequence counter protection. This internal function
 * is meant to be called only when the timekeeping lock is already held.
 */
time64_t __ktime_get_real_seconds(void)
{
	return tk_core.timekeeper.xtime_sec;
}
2016-02-22 19:15:20 +08:00
/**
* ktime_get_snapshot - snapshots the realtime / monotonic raw clocks with counter
* @ systime_snapshot : pointer to struct receiving the system time snapshot
*/
void ktime_get_snapshot ( struct system_time_snapshot * systime_snapshot )
{
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-02-22 19:15:20 +08:00
ktime_t base_raw ;
ktime_t base_real ;
2016-12-09 04:49:34 +08:00
u64 nsec_raw ;
u64 nsec_real ;
2016-12-22 03:32:01 +08:00
u64 now ;
2016-02-22 19:15:20 +08:00
2016-02-22 19:15:21 +08:00
WARN_ON_ONCE ( timekeeping_suspended ) ;
2016-02-22 19:15:20 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
systime_snapshot - > cs_was_changed_seq = tk - > cs_was_changed_seq ;
systime_snapshot - > clock_was_set_seq = tk - > clock_was_set_seq ;
2016-02-22 19:15:20 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
base_raw = tk - > tkr_raw . base ;
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono , now ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw , now ) ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
systime_snapshot - > cycles = now ;
systime_snapshot - > real = ktime_add_ns ( base_real , nsec_real ) ;
systime_snapshot - > raw = ktime_add_ns ( base_raw , nsec_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_snapshot ) ;
2015-12-13 12:24:18 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
 * scale64_check_overflow - Scale *base by mult/div, checking for overflow
 * @mult:	numerator of the scaling ratio
 * @div:	denominator of the scaling ratio
 * @base:	in: value to scale; out: scaled value (written on success only)
 *
 * *base is split into the quotient and remainder of a division by @div.
 * Both parts are multiplied by @mult separately, after verifying from the
 * bit lengths that neither product can exceed 64 bits.
 *
 * Return: 0 on success, -EOVERFLOW if one of the products could overflow.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
	/* Number of bits a factor may occupy next to @mult in a u64 product */
	const int headroom = (int)sizeof(u64) * 8 - fls64(mult);
	u64 quot, rem;

	quot = div64_u64_rem(*base, div, &rem);

	if (headroom < fls64(quot) || headroom < fls64(rem))
		return -EOVERFLOW;

	/*
	 * @div is a full 64 bit value. do_div() only takes a 32 bit divisor
	 * and would silently truncate it, so use div64_u64() for the
	 * remainder part.
	 */
	*base = quot * mult + div64_u64(rem * mult, div);

	return 0;
}
/**
* adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
* @ history : Snapshot representing start of history
* @ partial_history_cycles : Cycle offset into history ( fractional part )
* @ total_history_cycles : Total history length in cycles
* @ discontinuity : True indicates clock was set on history period
* @ ts : Cross timestamp that should be adjusted using
* partial / total ratio
*
* Helper function used by get_device_system_crosststamp ( ) to correct the
* crosstimestamp corresponding to the start of the current interval to the
* system counter value ( timestamp point ) provided by the driver . The
* total_history_ * quantities are the total history starting at the provided
* reference point and ending at the start of the current interval . The cycle
* count between the driver timestamp point and the start of the current
* interval is partial_history_cycles .
*/
static int adjust_historical_crosststamp ( struct system_time_snapshot * history ,
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles ,
u64 total_history_cycles ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ,
struct system_device_crosststamp * ts )
{
struct timekeeper * tk = & tk_core . timekeeper ;
u64 corr_raw , corr_real ;
bool interp_forward ;
int ret ;
if ( total_history_cycles = = 0 | | partial_history_cycles = = 0 )
return 0 ;
/* Interpolate shortest distance from beginning or end of history */
2017-03-25 03:03:35 +08:00
interp_forward = partial_history_cycles > total_history_cycles / 2 ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles ;
/*
* Scale the monotonic raw time delta by :
* partial_history_cycles / total_history_cycles
*/
corr_raw = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_monoraw , history - > raw ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_raw ) ;
if ( ret )
return ret ;
/*
* If there is a discontinuity in the history , scale monotonic raw
* correction by :
* mult ( real ) / mult ( raw ) yielding the realtime correction
* Otherwise , calculate the realtime correction similar to monotonic
* raw calculation
*/
if ( discontinuity ) {
corr_real = mul_u64_u32_div
( corr_raw , tk - > tkr_mono . mult , tk - > tkr_raw . mult ) ;
} else {
corr_real = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_realtime , history - > real ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_real ) ;
if ( ret )
return ret ;
}
/* Fixup monotonic raw and real time time values */
if ( interp_forward ) {
ts - > sys_monoraw = ktime_add_ns ( history - > raw , corr_raw ) ;
ts - > sys_realtime = ktime_add_ns ( history - > real , corr_real ) ;
} else {
ts - > sys_monoraw = ktime_sub_ns ( ts - > sys_monoraw , corr_raw ) ;
ts - > sys_realtime = ktime_sub_ns ( ts - > sys_realtime , corr_real ) ;
}
return 0 ;
}
/*
* cycle_between - true if test occurs chronologically between before and after
*/
2016-12-22 03:32:01 +08:00
static bool cycle_between ( u64 before , u64 test , u64 after )
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
{
if ( test > before & & test < after )
return true ;
if ( test < before & & before > after )
return true ;
return false ;
}
2016-02-22 19:15:22 +08:00
/**
* get_device_system_crosststamp - Synchronously capture system / device timestamp
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ get_time_fn : Callback to get simultaneous device time and
2016-02-22 19:15:22 +08:00
* system counter from the device driver
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ ctx : Context passed to get_time_fn ( )
* @ history_begin : Historical reference point used to interpolate system
* time when counter provided by the driver is before the current interval
2016-02-22 19:15:22 +08:00
* @ xtstamp : Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
*/
int get_device_system_crosststamp ( int ( * get_time_fn )
( ktime_t * device_time ,
struct system_counterval_t * sys_counterval ,
void * ctx ) ,
void * ctx ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
struct system_time_snapshot * history_begin ,
2016-02-22 19:15:22 +08:00
struct system_device_crosststamp * xtstamp )
{
struct system_counterval_t system_counterval ;
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 cycles , now , interval_start ;
2016-03-08 18:09:53 +08:00
unsigned int clock_was_set_seq = 0 ;
2016-02-22 19:15:22 +08:00
ktime_t base_real , base_raw ;
2016-12-09 04:49:34 +08:00
u64 nsec_real , nsec_raw ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
u8 cs_was_changed_seq ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool do_interp ;
2016-02-22 19:15:22 +08:00
int ret ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
ret = get_time_fn ( & xtstamp - > device , & system_counterval , ctx ) ;
if ( ret )
return ret ;
/*
* Verify that the clocksource associated with the captured
* system counter value is the same as the currently installed
* timekeeper clocksource
*/
if ( tk - > tkr_mono . clock ! = system_counterval . cs )
return - ENODEV ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
cycles = system_counterval . cycles ;
/*
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval .
*/
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
interval_start = tk - > tkr_mono . cycle_last ;
if ( ! cycle_between ( interval_start , cycles , now ) ) {
clock_was_set_seq = tk - > clock_was_set_seq ;
cs_was_changed_seq = tk - > cs_was_changed_seq ;
cycles = interval_start ;
do_interp = true ;
} else {
do_interp = false ;
}
2016-02-22 19:15:22 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
base_raw = tk - > tkr_raw . base ;
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono ,
system_counterval . cycles ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw ,
system_counterval . cycles ) ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
xtstamp - > sys_realtime = ktime_add_ns ( base_real , nsec_real ) ;
xtstamp - > sys_monoraw = ktime_add_ns ( base_raw , nsec_raw ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
* Interpolate if necessary , adjusting back from the start of the
* current interval
*/
if ( do_interp ) {
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles , total_history_cycles ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ;
/*
* Check that the counter value occurs after the provided
* history reference and that the history doesn ' t cross a
* clocksource change
*/
if ( ! history_begin | |
! cycle_between ( history_begin - > cycles ,
system_counterval . cycles , cycles ) | |
history_begin - > cs_was_changed_seq ! = cs_was_changed_seq )
return - EINVAL ;
partial_history_cycles = cycles - system_counterval . cycles ;
total_history_cycles = cycles - history_begin - > cycles ;
discontinuity =
history_begin - > clock_was_set_seq ! = clock_was_set_seq ;
ret = adjust_historical_crosststamp ( history_begin ,
partial_history_cycles ,
total_history_cycles ,
discontinuity , xtstamp ) ;
if ( ret )
return ret ;
}
2016-02-22 19:15:22 +08:00
return 0 ;
}
EXPORT_SYMBOL_GPL ( get_device_system_crosststamp ) ;
2007-05-08 15:27:59 +08:00
/**
2014-11-18 19:15:16 +08:00
* do_settimeofday64 - Sets the time of day .
* @ ts : pointer to the timespec64 variable containing the new time
2007-05-08 15:27:59 +08:00
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
2014-11-18 19:15:16 +08:00
int do_settimeofday64 ( const struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
/*
* Returns 0 on success. Returns -EINVAL when ts fails
* timespec64_valid_settod(), or when wall_to_monotonic compares
* greater than the requested delta.
*/
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-11-18 19:15:16 +08:00
struct timespec64 ts_delta , xt ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2015-06-23 18:38:54 +08:00
int ret = 0 ;
2007-05-08 15:27:59 +08:00
2019-03-23 18:36:19 +08:00
/* Validate the requested time before taking any lock. */
if ( ! timespec64_valid_settod ( ts ) )
2007-05-08 15:27:59 +08:00
return - EINVAL ;
2013-02-22 06:51:38 +08:00
/* Take timekeeper_lock, then open a write section on tk_core.seq. */
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
/*
* Forward the timekeeper before reading xtime.
* NOTE(review): the accumulation details live in
* timekeeping_forward_now(), which is not visible here.
*/
timekeeping_forward_now ( tk ) ;
2008-08-21 07:37:28 +08:00
2012-07-28 02:48:13 +08:00
xt = tk_xtime ( tk ) ;
2014-11-18 19:15:16 +08:00
/*
* Field-wise difference between the requested time and the current
* xtime; tv_nsec is not normalized here and may be negative.
*/
ts_delta . tv_sec = ts - > tv_sec - xt . tv_sec ;
ts_delta . tv_nsec = ts - > tv_nsec - xt . tv_nsec ;
2012-07-13 13:21:53 +08:00
2015-06-23 18:38:54 +08:00
/* Refuse the change, but still run the update path at "out". */
if ( timespec64_compare ( & tk - > wall_to_monotonic , & ts_delta ) > 0 ) {
ret = - EINVAL ;
goto out ;
}
2014-07-17 05:04:01 +08:00
/* Subtract the delta from wall_to_monotonic, then install the new xtime. */
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , ts_delta ) ) ;
2007-05-08 15:27:59 +08:00
2014-11-18 19:15:16 +08:00
tk_set_xtime ( tk , ts ) ;
2015-06-23 18:38:54 +08:00
/* Reached on both the success path and the -EINVAL path above. */
out :
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
2019-04-10 17:14:19 +08:00
/* Report the injected offset to audit only when the time was changed. */
if ( ! ret )
audit_tk_injoffset ( ts_delta ) ;
2015-06-23 18:38:54 +08:00
return ret ;
2007-05-08 15:27:59 +08:00
}
2014-11-18 19:15:16 +08:00
EXPORT_SYMBOL ( do_settimeofday64 ) ;
2007-05-08 15:27:59 +08:00
2011-02-01 21:52:17 +08:00
/**
* timekeeping_inject_offset - Adds or subtracts from the current time .
* @ ts : pointer to the timespec64 variable containing the offset
*
* Adds or subtracts an offset value from the current time .
*/
2018-07-13 20:06:42 +08:00
static int timekeeping_inject_offset ( const struct timespec64 * ts )
2011-02-01 21:52:17 +08:00
{
/*
* Returns 0 on success. Returns -EINVAL when ts->tv_nsec is outside
* [0, NSEC_PER_SEC), when wall_to_monotonic compares greater than the
* offset, or when the resulting time fails timespec64_valid_settod().
*/
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2017-10-19 19:14:45 +08:00
struct timespec64 tmp ;
2012-08-09 03:36:20 +08:00
int ret = 0 ;
2011-02-01 21:52:17 +08:00
2017-10-19 19:14:45 +08:00
/* Only the nanosecond field is range-checked; tv_sec is not checked here. */
if ( ts - > tv_nsec < 0 | | ts - > tv_nsec > = NSEC_PER_SEC )
2011-02-01 21:52:17 +08:00
return - EINVAL ;
2013-02-22 06:51:38 +08:00
/* Take timekeeper_lock, then open a write section on tk_core.seq. */
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-02-01 21:52:17 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
/* Make sure the proposed value is valid */
2017-10-19 19:14:45 +08:00
tmp = timespec64_add ( tk_xtime ( tk ) , * ts ) ;
if ( timespec64_compare ( & tk - > wall_to_monotonic , ts ) > 0 | |
2019-03-23 18:36:19 +08:00
! timespec64_valid_settod ( & tmp ) ) {
2012-08-09 03:36:20 +08:00
ret = - EINVAL ;
goto error ;
}
2012-07-13 13:21:53 +08:00
2017-10-19 19:14:45 +08:00
/* Add the offset to xtime and subtract it from wall_to_monotonic. */
tk_xtime_add ( tk , ts ) ;
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * ts ) ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
error : /* even if we error out, we forwarded the time, so call update */
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-02-01 21:52:17 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-02-01 21:52:17 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
2012-08-09 03:36:20 +08:00
return ret ;
2011-02-01 21:52:17 +08:00
}
2017-10-19 19:14:44 +08:00
/*
* Indicates if there is an offset between the system clock and the hardware
* clock / persistent clock / rtc .
*/
/* Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is non-zero. */
int persistent_clock_is_local ;
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time .
*
* This is ugly , but preferable to the alternatives . Otherwise we
* would either need to write a program to do it in / etc / rc ( and risk
* confusion if the program gets run more than once ; it would also be
* hard to make the program warp the clock precisely n hours ) or
* compile in the timezone information into the kernel . Bad , bad . . . .
*
* - TYT , 1992 - 01 - 01
*
* The best thing to do is to keep the CMOS clock in universal time ( UTC )
* as real UNIX machines always do it . This avoids all headaches about
* daylight saving times and warping kernel clocks .
*/
void timekeeping_warp_clock ( void )
{
/* Nothing to do when the timezone offset is zero. */
if ( sys_tz . tz_minuteswest ! = 0 ) {
2017-10-19 19:14:45 +08:00
struct timespec64 adjust ;
2017-10-19 19:14:44 +08:00
persistent_clock_is_local = 1 ;
/* tz_minuteswest is in minutes; convert it to whole seconds. */
adjust . tv_sec = sys_tz . tz_minuteswest * 60 ;
adjust . tv_nsec = 0 ;
/* The return value of timekeeping_inject_offset() is not checked here. */
timekeeping_inject_offset ( & adjust ) ;
}
}
2011-02-01 21:52:17 +08:00
2012-05-04 03:30:07 +08:00
/**
2016-12-08 06:33:23 +08:00
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
2012-05-04 03:30:07 +08:00
*
*/
2013-03-26 03:24:24 +08:00
static void __timekeeping_set_tai_offset ( struct timekeeper * tk , s32 tai_offset )
2012-05-04 03:30:07 +08:00
{
/* Record the raw offset in the timekeeper passed in by the caller. */
tk - > tai_offset = tai_offset ;
2013-12-11 09:13:35 +08:00
/*
* Derive offs_tai as offs_real plus the offset. tai_offset is the first
* ktime_set() argument with 0 as the second.
* NOTE(review): presumably (seconds, nanoseconds) - confirm against ktime_set().
*/
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tai_offset , 0 ) ) ;
2012-05-04 03:30:07 +08:00
}
2007-05-08 15:27:59 +08:00
/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
2009-08-14 21:47:30 +08:00
static int change_clocksource ( void * data )
2007-05-08 15:27:59 +08:00
{
/*
* data is the candidate clocksource; timekeeping_notify() passes this
* function to stop_machine(). It always returns 0, so the caller checks
* tkr_mono.clock afterwards to learn whether the switch happened.
*/
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-04-22 03:24:02 +08:00
struct clocksource * new , * old ;
2012-03-15 07:38:15 +08:00
unsigned long flags ;
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
new = ( struct clocksource * ) data ;
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:38 +08:00
/* Take timekeeper_lock, then open a write section on tk_core.seq. */
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-03-15 07:38:15 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2013-04-26 04:31:44 +08:00
/*
* If the cs is in module , get a module reference . Succeeds
* for built - in code ( owner = = NULL ) as well .
*/
if ( try_module_get ( new - > owner ) ) {
/* A missing enable hook counts as success; otherwise it must return 0. */
if ( ! new - > enable | | new - > enable ( new ) = = 0 ) {
2015-03-19 17:09:06 +08:00
old = tk - > tkr_mono . clock ;
2013-04-26 04:31:44 +08:00
tk_setup_internals ( tk , new ) ;
/* Disable the previous clocksource and drop its module reference. */
if ( old - > disable )
old - > disable ( old ) ;
module_put ( old - > owner ) ;
} else {
/* enable() failed: drop the reference taken above and keep the old clock. */
module_put ( new - > owner ) ;
}
2009-08-14 21:47:30 +08:00
}
2013-06-27 18:35:46 +08:00
/* Runs whether or not the clocksource was switched. */
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2012-03-15 07:38:15 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-03-15 07:38:15 +08:00
2009-08-14 21:47:30 +08:00
return 0 ;
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
/**
* timekeeping_notify - Install a new clock source
* @ clock : pointer to the clock source
*
* This function is called from clocksource . c after a new , better clock
* source has been registered . The caller holds the clocksource_mutex .
*/
2013-04-26 04:31:44 +08:00
int timekeeping_notify ( struct clocksource * clock )
2009-08-14 21:47:30 +08:00
{
/*
* Returns 0 when clock is the timekeeper's tkr_mono clocksource, either
* already or after the switch attempt, and -1 otherwise.
*/
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
2015-03-19 17:09:06 +08:00
/* Already installed: nothing to change. */
if ( tk - > tkr_mono . clock = = clock )
2013-04-26 04:31:44 +08:00
return 0 ;
2009-08-14 21:47:30 +08:00
/* Run change_clocksource() through stop_machine(); its return value is not used. */
stop_machine ( change_clocksource , clock , NULL ) ;
2007-05-08 15:27:59 +08:00
tick_clock_notify ( ) ;
2015-03-19 17:09:06 +08:00
/* Success is judged by re-reading the installed clocksource. */
return tk - > tkr_mono . clock = = clock ? 0 : - 1 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:30 +08:00
2008-08-21 07:37:30 +08:00
/**
2018-04-27 21:40:14 +08:00
* ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec64
2014-11-08 03:03:20 +08:00
* @ ts : pointer to the timespec64 to be set
2008-08-21 07:37:30 +08:00
*
* Returns the raw monotonic time ( completely un - modified by ntp )
*/
2018-04-27 21:40:14 +08:00
void ktime_get_raw_ts64 ( struct timespec64 * ts )
2008-08-21 07:37:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2008-08-21 07:37:30 +08:00
/*
* Sample raw_sec and the tkr_raw nanoseconds inside a tk_core.seq read
* section; repeat while read_seqcount_retry() asks for a retry.
*/
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-05-23 08:20:20 +08:00
ts - > tv_sec = tk - > raw_sec ;
2015-03-19 16:28:44 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2008-08-21 07:37:30 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2008-08-21 07:37:30 +08:00
2017-05-23 08:20:20 +08:00
/* Start from whole seconds, then add the sampled nanoseconds outside the loop. */
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsecs ) ;
2008-08-21 07:37:30 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_raw_ts64 ) ;
2014-11-08 03:03:20 +08:00
2008-08-21 07:37:30 +08:00
2007-05-08 15:27:59 +08:00
/**
2008-02-08 20:19:24 +08:00
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
2007-05-08 15:27:59 +08:00
*/
2008-02-08 20:19:24 +08:00
int timekeeping_valid_for_hres ( void )
2007-05-08 15:27:59 +08:00
{
/*
* Returns the current clocksource's flags masked with
* CLOCK_SOURCE_VALID_FOR_HRES: non-zero when the flag is set, 0
* otherwise. The value is not normalized to 0/1.
*/
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-05-08 15:27:59 +08:00
int ret ;
/* Read inside a tk_core.seq read section; repeat while a retry is requested. */
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > flags & CLOCK_SOURCE_VALID_FOR_HRES ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
return ret ;
}
2009-08-19 01:45:10 +08:00
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
*/
u64 timekeeping_max_deferment ( void )
{
/* Returns the max_idle_ns field of the timekeeper's tkr_mono clocksource. */
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2011-11-15 04:48:10 +08:00
u64 ret ;
2012-07-13 13:21:51 +08:00
2011-11-15 04:48:10 +08:00
/* Read inside a tk_core.seq read section; repeat while a retry is requested. */
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > max_idle_ns ;
2011-11-15 04:48:10 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2011-11-15 04:48:10 +08:00
return ret ;
2009-08-19 01:45:10 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2018-08-14 20:15:23 +08:00
* read_persistent_clock64 - Return time from the persistent clock .
2007-05-08 15:27:59 +08:00
*
* Weak dummy function for arches that do not yet support it .
2009-08-14 21:47:31 +08:00
* Reads the time from the battery backed persistent clock .
* Returns a timespec with tv_sec = 0 and tv_nsec = 0 if unsupported .
2007-05-08 15:27:59 +08:00
*
* XXX - Do be sure to remove it once all arches implement it .
*/
2018-08-14 20:15:23 +08:00
void __weak read_persistent_clock64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2009-08-14 21:47:31 +08:00
ts - > tv_sec = 0 ;
ts - > tv_nsec = 0 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:32 +08:00
/**
2018-07-20 04:55:34 +08:00
* read_persistent_wall_and_boot_offset - Read persistent clock , and also offset
* from the boot .
2009-08-14 21:47:32 +08:00
*
* Weak dummy function for arches that do not yet support it .
2018-07-20 04:55:34 +08:00
* wall_time - current time as returned by persistent clock
* boot_offset - offset that is defined as wall_time - boot_time
2018-07-20 04:55:35 +08:00
* The default function calculates offset based on the current value of
* local_clock ( ) . This way architectures that support sched_clock ( ) but don ' t
* support dedicated boot time clock will provide the best estimate of the
* boot time .
2009-08-14 21:47:32 +08:00
*/
2018-07-20 04:55:34 +08:00
void __weak __init
read_persistent_wall_and_boot_offset ( struct timespec64 * wall_time ,
struct timespec64 * boot_offset )
2009-08-14 21:47:32 +08:00
{
2018-07-20 04:55:34 +08:00
read_persistent_clock64 ( wall_time ) ;
2018-07-20 04:55:35 +08:00
* boot_offset = ns_to_timespec64 ( local_clock ( ) ) ;
2009-08-14 21:47:32 +08:00
}
2018-07-17 14:31:29 +08:00
/*
 * Flag reflecting whether sleep time still has to be injected after a
 * suspend.
 *
 * The flag starts off false and is set to true by timekeeping_suspend().
 * timekeeping_resume() clears it again when it was able to inject the
 * sleep time itself. Otherwise the flag stays true and is used by the RTC
 * resume code to decide whether sleep time must be injected;
 * timekeeping_inject_sleeptime64() then clears it.
 *
 * A suspend that fails before timekeeping_suspend() runs never sets the
 * flag, so it stays false and prevents erroneous sleep time injection.
 */
static bool suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
/*
 * Flag recording whether a persistent clock exists on this platform. It is
 * set when the persistent clock returns a usable (non-zero) value, either
 * in timekeeping_init() or later in timekeeping_suspend().
 */
static bool persistent_clock_exists ;
2007-05-08 15:27:59 +08:00
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Reads the persistent wall time and the boot offset, sanitizes both, and
 * then, under timekeeper_lock and the tk_core seqcount write section, sets
 * up the timekeeper with the default clocksource, the initial wall time and
 * the matching wall_to_monotonic offset.
 */
void __init timekeeping_init(void)
{
	struct timespec64 wall_time, boot_offset, wall_to_mono;
	struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock;
	unsigned long flags;

	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);

	/*
	 * A settable, strictly positive wall time means a persistent clock
	 * is present. Any other non-zero value is bogus: warn and fall back
	 * to zero. An all-zero value is silently accepted as "no clock".
	 */
	if (timespec64_valid_settod(&wall_time) &&
	    timespec64_to_ns(&wall_time) > 0) {
		persistent_clock_exists = true;
	} else if (timespec64_to_ns(&wall_time) != 0) {
		pr_warn("Persistent clock returned invalid value\n");
		wall_time = (struct timespec64){0};
	}

	/* A boot offset larger than the wall time is discarded. */
	if (timespec64_compare(&wall_time, &boot_offset) < 0)
		boot_offset = (struct timespec64){0};

	/*
	 * We want to set wall_to_mono, so that the following is true:
	 * wall time + wall_to_mono = boot time
	 */
	wall_to_mono = timespec64_sub(boot_offset, wall_time);

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);
	ntp_init();

	/* The default clocksource is enabled only if it has an enable hook. */
	clock = clocksource_default_clock();
	if (clock->enable)
		clock->enable(clock);
	tk_setup_internals(tk, clock);

	tk_set_xtime(tk, &wall_time);
	tk->raw_sec = 0;

	tk_set_wall_to_mono(tk, wall_to_mono);

	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

	write_seqcount_end(&tk_core.seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
/*
 * Persistent clock time at which the last suspend began. It is read in
 * timekeeping_suspend(), where it may also be nudged by the drift
 * compensation, and is compared against a fresh persistent clock reading
 * in timekeeping_resume().
 */
2014-07-17 05:04:01 +08:00
static struct timespec64 timekeeping_suspend_time ;
2007-05-08 15:27:59 +08:00
2011-04-02 05:32:09 +08:00
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @ delta : pointer to a timespec delta value
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables .
*/
2012-07-13 13:21:57 +08:00
static void __timekeeping_inject_sleeptime ( struct timekeeper * tk ,
2018-07-13 20:06:42 +08:00
const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( delta ) ) {
2014-06-05 07:11:43 +08:00
printk_deferred ( KERN_WARNING
" __timekeeping_inject_sleeptime: Invalid "
" sleep delta value! \n " ) ;
2011-06-02 09:18:09 +08:00
return ;
}
2012-07-13 13:21:57 +08:00
tk_xtime_add ( tk , delta ) ;
2018-04-25 21:33:38 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * delta ) ) ;
2014-07-17 05:05:00 +08:00
tk_update_sleep_time ( tk , timespec64_to_ktime ( * delta ) ) ;
2013-05-22 13:32:14 +08:00
tk_debug_account_sleep_time ( delta ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
2015-04-02 11:34:38 +08:00
/**
* We have three kinds of time sources to use for sleep time
* injection , the preference order is :
* 1 ) non - stop clocksource
* 2 ) persistent clock ( ie : RTC accessible when irqs are off )
* 3 ) RTC
*
* 1 ) and 2 ) are used by timekeeping , 3 ) by RTC subsystem .
* If system has neither 1 ) nor 2 ) , 3 ) will be used finally .
*
*
* If timekeeping has injected sleeptime via either 1 ) or 2 ) ,
* 3 ) becomes needless , so in this case we don ' t need to call
* rtc_resume ( ) , and this is what timekeeping_rtc_skipresume ( )
* means .
*/
bool timekeeping_rtc_skipresume ( void )
{
2018-07-17 14:31:29 +08:00
return ! suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
}
/*
 * timekeeping_rtc_skipsuspend - Tell the RTC code whether rtc_suspend() is needed
 *
 * Whether a non-stop clocksource can be used is only known in
 * timekeeping_resume(), which is invoked after rtc_suspend(). Having a
 * non-stop clocksource is therefore no reason to skip rtc_suspend().
 *
 * A persistent clock, on the other hand, will definitely be used when the
 * system has one, so in that case rtc_suspend() does not have to be called.
 *
 * Returns the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
	return persistent_clock_exists;
}
2011-04-02 05:32:09 +08:00
/**
2014-11-18 19:15:17 +08:00
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @ delta : pointer to a timespec64 delta value
2011-04-02 05:32:09 +08:00
*
2015-04-02 11:34:22 +08:00
* This hook is for architectures that cannot support read_persistent_clock64
2011-04-02 05:32:09 +08:00
* because their RTC / persistent clock is only accessible when irqs are enabled .
2015-04-02 11:34:38 +08:00
* and also don ' t have an effective nonstop clocksource .
2011-04-02 05:32:09 +08:00
*
* This function should only be called by rtc_resume ( ) , and allows
* a suspend offset to be injected into the timekeeping values .
*/
2018-07-13 20:06:42 +08:00
void timekeeping_inject_sleeptime64 ( const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2011-04-02 05:32:09 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2018-07-17 14:31:29 +08:00
suspend_timing_needed = false ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-04-02 05:32:09 +08:00
2014-11-18 19:15:17 +08:00
__timekeeping_inject_sleeptime ( tk , delta ) ;
2011-04-02 05:32:09 +08:00
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-04-02 05:32:09 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-04-02 05:32:09 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
}
2015-04-02 11:34:35 +08:00
# endif
2011-04-02 05:32:09 +08:00
2007-05-08 15:27:59 +08:00
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem .
*/
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
void timekeeping_resume ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2015-03-19 17:09:06 +08:00
struct clocksource * clock = tk - > tkr_mono . clock ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts_new , ts_delta ;
2018-07-17 15:55:16 +08:00
u64 cycle_now , nsec ;
2018-07-17 14:31:29 +08:00
bool inject_sleeptime = false ;
2009-08-14 21:47:31 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & ts_new ) ;
2007-05-08 15:27:59 +08:00
2012-08-06 07:40:41 +08:00
clockevents_resume ( ) ;
2007-05-14 17:10:02 +08:00
clocksource_resume ( ) ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2013-03-12 11:56:48 +08:00
/*
* After system resumes , we need to calculate the suspended time and
* compensate it for the OS time . There are 3 sources that could be
* used : Nonstop clocksource during suspend , persistent clock and rtc
* device .
*
* One specific platform may have 1 or 2 or all of them , and the
* preference will be :
* suspend - nonstop clocksource - > persistent clock - > rtc
* The less preferred source will only be tried if there is no better
* usable source . The rtc part is handled separately in rtc core code .
*/
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2018-07-17 15:55:16 +08:00
nsec = clocksource_stop_suspend_timing ( clock , cycle_now ) ;
if ( nsec > 0 ) {
2014-07-17 05:04:01 +08:00
ts_delta = ns_to_timespec64 ( nsec ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2014-07-17 05:04:01 +08:00
} else if ( timespec64_compare ( & ts_new , & timekeeping_suspend_time ) > 0 ) {
ts_delta = timespec64_sub ( ts_new , timekeeping_suspend_time ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2007-05-08 15:27:59 +08:00
}
2013-03-12 11:56:48 +08:00
2018-07-17 14:31:29 +08:00
if ( inject_sleeptime ) {
suspend_timing_needed = false ;
2013-03-12 11:56:48 +08:00
__timekeeping_inject_sleeptime ( tk , & ts_delta ) ;
2018-07-17 14:31:29 +08:00
}
2013-03-12 11:56:48 +08:00
/* Re-base the last cycle value */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2012-07-28 02:48:13 +08:00
tk - > ntp_error = 0 ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 0 ;
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
touch_softlockup_watchdog ( ) ;
2015-03-25 20:09:16 +08:00
tick_resume ( ) ;
2011-05-02 22:48:57 +08:00
hrtimers_resume ( ) ;
2007-05-08 15:27:59 +08:00
}
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
int timekeeping_suspend ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 delta , delta_delta ;
static struct timespec64 old_delta ;
2018-07-17 15:55:16 +08:00
struct clocksource * curr_clock ;
u64 cycle_now ;
2007-05-08 15:27:59 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & timekeeping_suspend_time ) ;
2007-09-16 21:36:43 +08:00
2013-05-18 02:24:05 +08:00
/*
* On some systems the persistent_clock can not be detected at
* timekeeping_init by its return value , so if we see a valid
* value returned , update the persistent_clock_exists flag .
*/
if ( timekeeping_suspend_time . tv_sec | | timekeeping_suspend_time . tv_nsec )
2015-04-02 11:34:38 +08:00
persistent_clock_exists = true ;
2013-05-18 02:24:05 +08:00
2018-07-17 14:31:29 +08:00
suspend_timing_needed = true ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 1 ;
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
2018-07-17 15:55:16 +08:00
/*
* Since we ' ve called forward_now , cycle_last stores the value
* just read from the current clocksource . Save this to potentially
* use in suspend timing .
*/
curr_clock = tk - > tkr_mono . clock ;
cycle_now = tk - > tkr_mono . cycle_last ;
clocksource_start_suspend_timing ( curr_clock , cycle_now ) ;
2015-04-02 11:34:38 +08:00
if ( persistent_clock_exists ) {
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
/*
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
* To avoid drift caused by repeated suspend / resumes ,
* which each can add ~ 1 second drift error ,
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant .
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
*/
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
delta = timespec64_sub ( tk_xtime ( tk ) , timekeeping_suspend_time ) ;
delta_delta = timespec64_sub ( delta , old_delta ) ;
if ( abs ( delta_delta . tv_sec ) > = 2 ) {
/*
* if delta_delta is too large , assume time correction
* has occurred and set old_delta to the current delta .
*/
old_delta = delta ;
} else {
/* Otherwise try to adjust old_system to compensate */
timekeeping_suspend_time =
timespec64_add ( timekeeping_suspend_time , delta_delta ) ;
}
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
}
2013-12-12 11:10:36 +08:00
timekeeping_update ( tk , TK_MIRROR ) ;
2015-02-13 21:49:02 +08:00
halt_fast_timekeeper ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
2015-03-25 20:09:16 +08:00
tick_suspend ( ) ;
2010-02-03 06:41:41 +08:00
clocksource_suspend ( ) ;
2012-08-06 07:40:41 +08:00
clockevents_suspend ( ) ;
2007-05-08 15:27:59 +08:00
return 0 ;
}
/* sysfs resume/suspend bits for timekeeping */
2011-03-24 05:16:04 +08:00
static struct syscore_ops timekeeping_syscore_ops = {
2007-05-08 15:27:59 +08:00
. resume = timekeeping_resume ,
. suspend = timekeeping_suspend ,
} ;
2011-03-24 05:16:04 +08:00
/*
 * Registers timekeeping_syscore_ops via register_syscore_ops().
 * Run as a device_initcall; always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
	register_syscore_ops(&timekeeping_syscore_ops);

	return 0;
}
device_initcall(timekeeping_init_ops);
2007-05-08 15:27:59 +08:00
/*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* Apply a multiplier adjustment to the timekeeper
2007-05-08 15:27:59 +08:00
*/
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
static __always_inline void timekeeping_apply_adjustment ( struct timekeeper * tk ,
s64 offset ,
2018-03-10 02:42:48 +08:00
s32 mult_adj )
2007-05-08 15:27:59 +08:00
{
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
s64 interval = tk - > cycle_interval ;
2007-05-08 15:27:59 +08:00
2018-03-10 02:42:48 +08:00
if ( mult_adj = = 0 ) {
return ;
} else if ( mult_adj = = - 1 ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
interval = - interval ;
2018-03-10 02:42:48 +08:00
offset = - offset ;
} else if ( mult_adj ! = 1 ) {
interval * = mult_adj ;
offset * = mult_adj ;
2012-08-05 03:21:14 +08:00
}
2007-05-08 15:27:59 +08:00
2011-10-28 09:12:42 +08:00
/*
* So the following can be confusing .
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* To keep things simple , lets assume mult_adj = = 1 for now .
2011-10-28 09:12:42 +08:00
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* When mult_adj ! = 1 , remember that the interval and offset values
2011-10-28 09:12:42 +08:00
* have been appropriately scaled so the math is the same .
*
* The basic idea here is that we ' re increasing the multiplier
* by one , this causes the xtime_interval to be incremented by
* one cycle_interval . This is because :
* xtime_interval = cycle_interval * mult
* So if mult is being incremented by one :
* xtime_interval = cycle_interval * ( mult + 1 )
* Its the same as :
* xtime_interval = ( cycle_interval * mult ) + cycle_interval
* Which can be shortened to :
* xtime_interval + = cycle_interval
*
* So offset stores the non - accumulated cycles . Thus the current
* time ( in shifted nanoseconds ) is :
* now = ( offset * adj ) + xtime_nsec
* Now , even though we ' re adjusting the clock frequency , we have
* to keep time consistent . In other words , we can ' t jump back
* in time , and we also want to avoid jumping forward in time .
*
* So given the same offset value , we need the time to be the same
* both before and after the freq adjustment .
* now = ( offset * adj_1 ) + xtime_nsec_1
* now = ( offset * adj_2 ) + xtime_nsec_2
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_2 ) + xtime_nsec_2
* And we know :
* adj_2 = adj_1 + 1
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * ( adj_1 + 1 ) ) + xtime_nsec_2
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_1 ) + offset + xtime_nsec_2
* Canceling the sides :
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us :
* xtime_nsec_2 = xtime_nsec_1 - offset
* Which simplfies to :
* xtime_nsec - = offset
*/
2015-03-19 17:09:06 +08:00
if ( ( mult_adj > 0 ) & & ( tk - > tkr_mono . mult + mult_adj < mult_adj ) ) {
time: Avoid possible NTP adjustment mult overflow.
Ideally, __clocksource_updatefreq_scale, selects the largest shift
value possible for a clocksource. This results in the mult memember of
struct clocksource being particularly large, although not so large
that NTP would adjust the clock to cause it to overflow.
That said, nothing actually prohibits an overflow from occuring, its
just that it "shouldn't" occur.
So while very unlikely, and so far never observed, the value of
(cs->mult+cs->maxadj) may have a chance to reach very near 0xFFFFFFFF,
so there is a possibility it may overflow when doing NTP positive
adjustment
See the following detail: When NTP slewes the clock, kernel goes
through update_wall_time()->...->timekeeping_apply_adjustment():
tk->tkr.mult += mult_adj;
Since there is no guard against it, its possible tk->tkr.mult may
overflow during this operation.
This patch avoids any possible mult overflow by judging the overflow
case before adding mult_adj to mult, also adds the WARNING message
when capturing such case.
Signed-off-by: pang.xunlei <pang.xunlei@linaro.org>
[jstultz: Reworded commit message]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2014-10-08 15:03:34 +08:00
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE ( 1 ) ;
return ;
}
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult + = mult_adj ;
2012-07-13 13:21:57 +08:00
tk - > xtime_interval + = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = offset ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
/*
2018-03-10 02:42:48 +08:00
* Adjust the timekeeper ' s multiplier to the correct frequency
* and also to reduce the accumulated error value .
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
*/
2018-03-10 02:42:48 +08:00
static void timekeeping_adjust ( struct timekeeper * tk , s64 offset )
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
{
2018-03-10 02:42:48 +08:00
u32 mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2015-12-04 02:23:30 +08:00
/*
2018-03-10 02:42:48 +08:00
* Determine the multiplier from the current NTP tick length .
* Avoid expensive division when the tick length doesn ' t change .
2015-12-04 02:23:30 +08:00
*/
2018-03-10 02:42:48 +08:00
if ( likely ( tk - > ntp_tick = = ntp_tick_length ( ) ) ) {
mult = tk - > tkr_mono . mult - tk - > ntp_err_mult ;
} else {
tk - > ntp_tick = ntp_tick_length ( ) ;
mult = div64_u64 ( ( tk - > ntp_tick > > tk - > ntp_error_shift ) -
tk - > xtime_remainder , tk - > cycle_interval ) ;
2015-12-04 02:23:30 +08:00
}
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
/*
* If the clock is behind the NTP time , increase the multiplier by 1
* to catch up with it . If it ' s ahead and there was a remainder in the
* tick division , the clock will slow down . Otherwise it will stay
* ahead until the tick length changes to a non - divisible value .
*/
tk - > ntp_err_mult = tk - > ntp_error > 0 ? 1 : 0 ;
mult + = tk - > ntp_err_mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
timekeeping_apply_adjustment ( tk , offset , mult - tk - > tkr_mono . mult ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2015-03-19 17:09:06 +08:00
if ( unlikely ( tk - > tkr_mono . clock - > maxadj & &
( abs ( tk - > tkr_mono . mult - tk - > tkr_mono . clock - > mult )
> tk - > tkr_mono . clock - > maxadj ) ) ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
printk_once ( KERN_WARNING
" Adjusting %s more than 11%% (%ld vs %ld) \n " ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . clock - > name , ( long ) tk - > tkr_mono . mult ,
( long ) tk - > tkr_mono . clock - > mult + tk - > tkr_mono . clock - > maxadj ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
2012-07-13 13:21:56 +08:00
/*
* It may be possible that when we entered this function , xtime_nsec
* was very small . Further , if we ' re slightly speeding the clocksource
* in the code above , its possible the required corrective factor to
* xtime_nsec could cause it to underflow .
*
2018-03-10 02:42:48 +08:00
* Now , since we have already accumulated the second and the NTP
* subsystem has been notified via second_overflow ( ) , we need to skip
* the next update .
2012-07-13 13:21:56 +08:00
*/
2015-03-19 17:09:06 +08:00
if ( unlikely ( ( s64 ) tk - > tkr_mono . xtime_nsec < 0 ) ) {
2018-03-10 02:42:48 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) NSEC_PER_SEC < <
tk - > tkr_mono . shift ;
tk - > xtime_sec - - ;
tk - > skip_second_overflow = 1 ;
2012-07-13 13:21:56 +08:00
}
2007-05-08 15:27:59 +08:00
}
2012-07-13 13:21:54 +08:00
/**
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
2015-08-25 14:42:53 +08:00
* Helper function that accumulates the nsecs greater than a second
2012-07-13 13:21:54 +08:00
* from the xtime_nsec field to the xtime_secs field .
* It also calls into the NTP code to handle leapsecond processing .
*
*/
2013-06-27 18:35:46 +08:00
static inline unsigned int accumulate_nsecs_to_secs ( struct timekeeper * tk )
2012-07-13 13:21:54 +08:00
{
2015-03-19 17:09:06 +08:00
u64 nsecps = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2012-07-13 13:21:54 +08:00
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = nsecps ) {
2012-07-13 13:21:54 +08:00
int leap ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = nsecps ;
2012-07-13 13:21:54 +08:00
tk - > xtime_sec + + ;
2018-03-10 02:42:48 +08:00
/*
* Skip NTP update if this second was accumulated before ,
* i . e . xtime_nsec underflowed in timekeeping_adjust ( )
*/
if ( unlikely ( tk - > skip_second_overflow ) ) {
tk - > skip_second_overflow = 0 ;
continue ;
}
2012-07-13 13:21:54 +08:00
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow ( tk - > xtime_sec ) ;
2012-07-28 02:48:12 +08:00
if ( unlikely ( leap ) ) {
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2012-07-28 02:48:12 +08:00
tk - > xtime_sec + = leap ;
2012-07-13 13:21:54 +08:00
2012-07-28 02:48:12 +08:00
ts . tv_sec = leap ;
ts . tv_nsec = 0 ;
tk_set_wall_to_mono ( tk ,
2014-07-17 05:04:01 +08:00
timespec64_sub ( tk - > wall_to_monotonic , ts ) ) ;
2012-07-28 02:48:12 +08:00
2012-05-04 03:30:07 +08:00
__timekeeping_set_tai_offset ( tk , tk - > tai_offset - leap ) ;
2013-12-12 12:07:49 +08:00
clock_set = TK_CLOCK_WAS_SET ;
2012-07-28 02:48:12 +08:00
}
2012-07-13 13:21:54 +08:00
}
2013-12-12 12:07:49 +08:00
return clock_set ;
2012-07-13 13:21:54 +08:00
}
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach than just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
* into a shifted interval nanoseconds . Allows for O ( log ) accumulation
* loop .
*
* Returns the unconsumed cycles .
*/
2016-12-22 03:32:01 +08:00
static u64 logarithmic_accumulation ( struct timekeeper * tk , u64 offset ,
u32 shift , unsigned int * clock_set )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
{
2016-12-22 03:32:01 +08:00
u64 interval = tk - > cycle_interval < < shift ;
2017-06-09 07:44:21 +08:00
u64 snsec_per_sec ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2015-08-25 14:42:53 +08:00
/* If the offset is smaller than a shifted interval, do nothing */
2013-02-22 06:51:36 +08:00
if ( offset < interval )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
/* Accumulate one shifted interval */
2013-02-22 06:51:36 +08:00
offset - = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last + = interval ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last + = interval ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = tk - > xtime_interval < < shift ;
2013-12-12 12:07:49 +08:00
* clock_set | = accumulate_nsecs_to_secs ( tk ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2010-08-10 05:20:09 +08:00
/* Accumulate raw time */
2017-06-09 07:44:21 +08:00
tk - > tkr_raw . xtime_nsec + = tk - > raw_interval < < shift ;
snsec_per_sec = ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ;
while ( tk - > tkr_raw . xtime_nsec > = snsec_per_sec ) {
tk - > tkr_raw . xtime_nsec - = snsec_per_sec ;
2017-05-23 08:20:20 +08:00
tk - > raw_sec + + ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
}
/* Accumulate error between NTP and clock interval */
2014-04-24 11:53:29 +08:00
tk - > ntp_error + = tk - > ntp_tick < < shift ;
2012-07-13 13:21:57 +08:00
tk - > ntp_error - = ( tk - > xtime_interval + tk - > xtime_remainder ) < <
( tk - > ntp_error_shift + shift ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
}
2018-06-04 21:34:21 +08:00
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
2007-05-08 15:27:59 +08:00
*/
2018-06-04 21:34:21 +08:00
static void timekeeping_advance ( enum timekeeping_adv_mode mode )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * real_tk = & tk_core . timekeeper ;
2013-02-22 06:51:40 +08:00
struct timekeeper * tk = & shadow_timekeeper ;
2016-12-22 03:32:01 +08:00
u64 offset ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
int shift = 0 , maxshift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2011-11-15 04:48:10 +08:00
unsigned long flags ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
2011-11-15 04:48:10 +08:00
goto out ;
2007-05-08 15:27:59 +08:00
2010-07-14 08:56:20 +08:00
# ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
2013-02-22 06:51:40 +08:00
offset = real_tk - > cycle_interval ;
2018-06-04 21:34:21 +08:00
if ( mode ! = TK_ADV_TICK )
goto out ;
2010-07-14 08:56:20 +08:00
# else
2017-06-09 07:44:20 +08:00
offset = clocksource_delta ( tk_clock_read ( & tk - > tkr_mono ) ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
2007-05-08 15:27:59 +08:00
2012-08-22 08:30:49 +08:00
/* Check if there's really nothing to do */
2018-06-04 21:34:21 +08:00
if ( offset < real_tk - > cycle_interval & & mode = = TK_ADV_TICK )
2012-08-22 08:30:49 +08:00
goto out ;
2018-06-04 21:34:21 +08:00
# endif
2012-08-22 08:30:49 +08:00
2015-03-12 12:16:32 +08:00
/* Do some additional sanity checking */
2017-06-28 21:21:35 +08:00
timekeeping_check_update ( tk , offset ) ;
2015-03-12 12:16:32 +08:00
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* ( think " ticks " ) worth of time at once . To do this efficiently ,
* we calculate the largest doubling multiple of cycle_intervals
2012-03-15 11:28:56 +08:00
* that is smaller than the offset . We then accumulate that
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* chunk in one go , and then try to consume the next smaller
* doubled multiple .
2007-05-08 15:27:59 +08:00
*/
2012-07-28 02:48:13 +08:00
shift = ilog2 ( offset ) - ilog2 ( tk - > cycle_interval ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = max ( 0 , shift ) ;
2012-03-15 11:28:56 +08:00
/* Bound shift to one less than what overflows tick_length */
2011-11-15 05:18:07 +08:00
maxshift = ( 64 - ( ilog2 ( ntp_tick_length ( ) ) + 1 ) ) - 1 ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = min ( shift , maxshift ) ;
2012-07-28 02:48:13 +08:00
while ( offset > = tk - > cycle_interval ) {
2013-12-12 12:07:49 +08:00
offset = logarithmic_accumulation ( tk , offset , shift ,
& clock_set ) ;
2012-07-28 02:48:13 +08:00
if ( offset < tk - > cycle_interval < < shift )
2010-03-19 05:47:30 +08:00
shift - - ;
2007-05-08 15:27:59 +08:00
}
2018-03-10 02:42:48 +08:00
/* Adjust the multiplier to correct NTP error */
2012-07-28 02:48:13 +08:00
timekeeping_adjust ( tk , offset ) ;
2007-05-08 15:27:59 +08:00
2010-04-07 05:30:51 +08:00
/*
* Finally , make sure that after the rounding
2012-07-13 13:21:53 +08:00
* xtime_nsec isn ' t larger than NSEC_PER_SEC
2010-04-07 05:30:51 +08:00
*/
2013-12-12 12:07:49 +08:00
clock_set | = accumulate_nsecs_to_secs ( tk ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
/*
* Update the real timekeeper .
*
* We could avoid this memcpy by switching pointers , but that
* requires changes to all other timekeeper usage sites as
* well , i . e . move the timekeeper pointer getter into the
* spinlocked / seqcount protected sections . And we trade this
2014-07-17 05:04:07 +08:00
* memcpy under the tk_core . seq against one before we start
2013-02-22 06:51:40 +08:00
* updating .
*/
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately in the main update_wall_time() logic, we
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping (skipping the shadow mirror). The bug with this is
that the additional bookkeeping isn't all read-only, and some of
it changes timekeeper state. Thus we might then overwrite this
state change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
timekeeping_update ( tk , clock_set ) ;
2013-02-22 06:51:40 +08:00
memcpy ( real_tk , tk , sizeof ( * tk ) ) ;
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately in the main update_wall_time() logic, we
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping (skipping the shadow mirror). The bug with this is
that the additional bookkeeping isn't all read-only, and some of
it changes timekeeper state. Thus we might then overwrite this
state change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
/* The memcpy must come last. Do not put anything here! */
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
out :
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2013-12-13 05:10:55 +08:00
if ( clock_set )
2014-03-28 07:30:49 +08:00
/* Have to call _delayed version, since in irq context*/
clock_was_set_delayed ( ) ;
2007-05-08 15:27:59 +08:00
}
2007-07-16 14:39:41 +08:00
2018-06-04 21:34:21 +08:00
/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Thin wrapper: advances the timekeeper by calling timekeeping_advance()
 * in TK_ADV_TICK mode, i.e. the "a tick has passed" update mode.
 */
void update_wall_time ( void )
{
timekeeping_advance ( TK_ADV_TICK ) ;
}
2007-07-16 14:39:41 +08:00
/**
2014-12-09 04:00:09 +08:00
* getboottime64 - Return the real time of system boot .
* @ ts : pointer to the timespec64 to be set
2007-07-16 14:39:41 +08:00
*
2014-12-09 04:00:09 +08:00
* Returns the wall - time of boot in a timespec64 .
2007-07-16 14:39:41 +08:00
*
* This is based on the wall_to_monotonic offset and the total suspend
* time . Calls to settimeofday will affect the value returned ( which
* basically means that however wrong your real time clock is at boot time ,
* you get the right time here ) .
*/
2014-12-09 04:00:09 +08:00
void getboottime64 ( struct timespec64 * ts )
2007-07-16 14:39:41 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2018-04-25 21:33:38 +08:00
ktime_t t = ktime_sub ( tk - > offs_real , tk - > offs_boot ) ;
2014-07-17 05:04:58 +08:00
2014-12-09 04:00:09 +08:00
* ts = ktime_to_timespec64 ( t ) ;
2007-07-16 14:39:41 +08:00
}
2014-12-09 04:00:09 +08:00
EXPORT_SYMBOL_GPL ( getboottime64 ) ;
2007-07-16 14:39:41 +08:00
2018-04-27 21:40:14 +08:00
void ktime_get_coarse_real_ts64 ( struct timespec64 * ts )
2007-07-25 08:47:43 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-07-25 08:47:43 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2018-04-27 21:40:14 +08:00
* ts = tk_xtime ( tk ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-07-25 08:47:43 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_real_ts64 ) ;
2009-08-20 10:13:34 +08:00
2018-04-27 21:40:14 +08:00
void ktime_get_coarse_ts64 ( struct timespec64 * ts )
2009-08-20 10:13:34 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 now , mono ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2009-08-20 10:13:34 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2012-07-28 02:48:13 +08:00
now = tk_xtime ( tk ) ;
mono = tk - > wall_to_monotonic ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-08-20 10:13:34 +08:00
2018-04-27 21:40:14 +08:00
set_normalized_timespec64 ( ts , now . tv_sec + mono . tv_sec ,
2009-08-20 10:13:34 +08:00
now . tv_nsec + mono . tv_nsec ) ;
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_ts64 ) ;
2011-01-27 22:58:55 +08:00
/*
 * do_timer - account for @ticks elapsed ticks
 *
 * Adds @ticks to jiffies_64 and then passes the same count to
 * calc_global_load().
 *
 * Must hold jiffies_lock
 */
void do_timer ( unsigned long ticks )
{
jiffies_64 + = ticks ;
calc_global_load ( ticks ) ;
}
2011-01-27 22:59:05 +08:00
2012-07-11 06:43:24 +08:00
/**
2014-07-17 05:03:52 +08:00
* ktime_get_update_offsets_now - hrtimer helper
2015-04-15 05:08:37 +08:00
* @ cwsseq : pointer to check and store the clock was set sequence number
2012-07-11 06:43:24 +08:00
* @ offs_real : pointer to storage for monotonic - > realtime offset
2018-04-25 21:33:38 +08:00
* @ offs_boot : pointer to storage for monotonic - > boottime offset
2013-10-18 09:13:30 +08:00
* @ offs_tai : pointer to storage for monotonic - > clock tai offset
2012-07-11 06:43:24 +08:00
*
2015-04-15 05:08:37 +08:00
* Returns current monotonic time and updates the offsets if the
* sequence number in @ cwsseq and timekeeper . clock_was_set_seq are
* different .
*
2013-10-18 09:13:30 +08:00
* Called from hrtimer_interrupt ( ) or retrigger_next_event ( )
2012-07-11 06:43:24 +08:00
*/
2015-04-15 05:08:37 +08:00
ktime_t ktime_get_update_offsets_now ( unsigned int * cwsseq , ktime_t * offs_real ,
2018-04-25 21:33:38 +08:00
ktime_t * offs_boot , ktime_t * offs_tai )
2012-07-11 06:43:24 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-11 06:43:24 +08:00
unsigned int seq ;
2014-07-17 05:04:19 +08:00
ktime_t base ;
u64 nsecs ;
2012-07-11 06:43:24 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2012-07-11 06:43:24 +08:00
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
base = ktime_add_ns ( base , nsecs ) ;
2015-04-15 05:08:37 +08:00
if ( * cwsseq ! = tk - > clock_was_set_seq ) {
* cwsseq = tk - > clock_was_set_seq ;
* offs_real = tk - > offs_real ;
2018-04-25 21:33:38 +08:00
* offs_boot = tk - > offs_boot ;
2015-04-15 05:08:37 +08:00
* offs_tai = tk - > offs_tai ;
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Handle leapsecond insertion adjustments */
2016-12-25 18:38:40 +08:00
if ( unlikely ( base > = tk - > next_leap_ktime ) )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
* offs_real = ktime_sub ( tk - > offs_real , ktime_set ( 1 , 0 ) ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2012-07-11 06:43:24 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
return base ;
2012-07-11 06:43:24 +08:00
}
2017-10-19 19:14:44 +08:00
/**
2017-10-19 19:14:45 +08:00
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
2017-10-19 19:14:44 +08:00
*/
2018-07-03 13:44:21 +08:00
static int timekeeping_validate_timex ( const struct __kernel_timex * txc )
2017-10-19 19:14:44 +08:00
{
if ( txc - > modes & ADJ_ADJTIME ) {
/* singleshot must not be used with any other mode bits */
if ( ! ( txc - > modes & ADJ_OFFSET_SINGLESHOT ) )
return - EINVAL ;
if ( ! ( txc - > modes & ADJ_OFFSET_READONLY ) & &
! capable ( CAP_SYS_TIME ) )
return - EPERM ;
} else {
/* In order to modify anything, you gotta be super-user! */
if ( txc - > modes & & ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
/*
* if the quartz is off by more than 10 % then
* something is VERY wrong !
*/
if ( txc - > modes & ADJ_TICK & &
( txc - > tick < 900000 / USER_HZ | |
txc - > tick > 1100000 / USER_HZ ) )
return - EINVAL ;
}
if ( txc - > modes & ADJ_SETOFFSET ) {
/* In order to inject time, you gotta be super-user! */
if ( ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
2017-10-19 19:14:45 +08:00
/*
* Validate if a timespec / timeval used to inject a time
* offset is valid . Offsets can be postive or negative , so
* we don ' t check tv_sec . The value of the timeval / timespec
* is the sum of its fields , but * NOTE * :
* The field tv_usec / tv_nsec must always be non - negative and
* we can ' t have more nanoseconds / microseconds than a second .
*/
if ( txc - > time . tv_usec < 0 )
return - EINVAL ;
2017-10-19 19:14:44 +08:00
2017-10-19 19:14:45 +08:00
if ( txc - > modes & ADJ_NANO ) {
if ( txc - > time . tv_usec > = NSEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
} else {
2017-10-19 19:14:45 +08:00
if ( txc - > time . tv_usec > = USEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
}
}
/*
* Check for potential multiplication overflows that can
* only happen on 64 - bit systems :
*/
if ( ( txc - > modes & ADJ_FREQUENCY ) & & ( BITS_PER_LONG = = 64 ) ) {
if ( LLONG_MIN / PPM_SCALE > txc - > freq )
return - EINVAL ;
if ( LLONG_MAX / PPM_SCALE < txc - > freq )
return - EINVAL ;
}
return 0 ;
}
2013-03-23 02:31:29 +08:00
/**
* do_adjtimex ( ) - Accessor function to NTP __do_adjtimex function
*/
2018-07-03 13:44:21 +08:00
int do_adjtimex ( struct __kernel_timex * txc )
2013-03-23 02:31:29 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-04-10 17:14:20 +08:00
struct audit_ntp_data ad ;
2013-03-23 02:37:28 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2013-04-11 03:41:49 +08:00
s32 orig_tai , tai ;
2013-03-23 03:08:52 +08:00
int ret ;
/* Validate the data before disabling interrupts */
2017-10-19 19:14:45 +08:00
ret = timekeeping_validate_timex ( txc ) ;
2013-03-23 03:08:52 +08:00
if ( ret )
return ret ;
2013-03-23 06:04:13 +08:00
if ( txc - > modes & ADJ_SETOFFSET ) {
2017-10-19 19:14:45 +08:00
struct timespec64 delta ;
2013-03-23 06:04:13 +08:00
delta . tv_sec = txc - > time . tv_sec ;
delta . tv_nsec = txc - > time . tv_usec ;
if ( ! ( txc - > modes & ADJ_NANO ) )
delta . tv_nsec * = 1000 ;
ret = timekeeping_inject_offset ( & delta ) ;
if ( ret )
return ret ;
2019-04-10 17:14:19 +08:00
audit_tk_injoffset ( delta ) ;
2013-03-23 06:04:13 +08:00
}
2019-04-10 17:14:20 +08:00
audit_ntp_init ( & ad ) ;
2018-06-18 22:08:01 +08:00
ktime_get_real_ts64 ( & ts ) ;
2013-03-23 03:28:15 +08:00
2013-03-23 02:37:28 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-04-11 03:41:49 +08:00
orig_tai = tai = tk - > tai_offset ;
2019-04-10 17:14:20 +08:00
ret = __do_adjtimex ( txc , & ts , & tai , & ad ) ;
2013-03-23 02:31:29 +08:00
2013-04-11 03:41:49 +08:00
if ( tai ! = orig_tai ) {
__timekeeping_set_tai_offset ( tk , tai ) ;
2013-12-12 10:50:25 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2013-04-11 03:41:49 +08:00
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2019-04-10 17:14:20 +08:00
audit_ntp_log ( & ad ) ;
2018-06-04 21:34:21 +08:00
/* Update the multiplier immediately if frequency was set directly */
if ( txc - > modes & ( ADJ_FREQUENCY | ADJ_TICK ) )
timekeeping_advance ( TK_ADV_FREQ ) ;
timekeeping: Avoid possible deadlock from clock_was_set_delayed
As part of normal operaions, the hrtimer subsystem frequently calls
into the timekeeping code, creating a locking order of
hrtimer locks -> timekeeping locks
clock_was_set_delayed() was suppoed to allow us to avoid deadlocks
between the timekeeping the hrtimer subsystem, so that we could
notify the hrtimer subsytem the time had changed while holding
the timekeeping locks. This was done by scheduling delayed work
that would run later once we were out of the timekeeing code.
But unfortunately the lock chains are complex enoguh that in
scheduling delayed work, we end up eventually trying to grab
an hrtimer lock.
Sasha Levin noticed this in testing when the new seqlock lockdep
enablement triggered the following (somewhat abrieviated) message:
[ 251.100221] ======================================================
[ 251.100221] [ INFO: possible circular locking dependency detected ]
[ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted
[ 251.101967] -------------------------------------------------------
[ 251.101967] kworker/10:1/4506 is trying to acquire lock:
[ 251.101967] (timekeeper_seq){----..}, at: [<ffffffff81160e96>] retrigger_next_event+0x56/0x70
[ 251.101967]
[ 251.101967] but task is already holding lock:
[ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] which lock already depends on the new lock.
[ 251.101967]
[ 251.101967]
[ 251.101967] the existing dependency chain (in reverse order) is:
[ 251.101967]
-> #5 (hrtimer_bases.lock#11){-.-...}:
[snipped]
-> #4 (&rt_b->rt_runtime_lock){-.-...}:
[snipped]
-> #3 (&rq->lock){-.-.-.}:
[snipped]
-> #2 (&p->pi_lock){-.-.-.}:
[snipped]
-> #1 (&(&pool->lock)->rlock){-.-...}:
[ 251.101967] [<ffffffff81194803>] validate_chain+0x6c3/0x7b0
[ 251.101967] [<ffffffff81194d9d>] __lock_acquire+0x4ad/0x580
[ 251.101967] [<ffffffff81194ff2>] lock_acquire+0x182/0x1d0
[ 251.101967] [<ffffffff84398500>] _raw_spin_lock+0x40/0x80
[ 251.101967] [<ffffffff81153e69>] __queue_work+0x1a9/0x3f0
[ 251.101967] [<ffffffff81154168>] queue_work_on+0x98/0x120
[ 251.101967] [<ffffffff81161351>] clock_was_set_delayed+0x21/0x30
[ 251.101967] [<ffffffff811c4bd1>] do_adjtimex+0x111/0x160
[ 251.101967] [<ffffffff811e2711>] compat_sys_adjtimex+0x41/0x70
[ 251.101967] [<ffffffff843a4b49>] ia32_sysret+0x0/0x5
[ 251.101967]
-> #0 (timekeeper_seq){----..}:
[snipped]
[ 251.101967] other info that might help us debug this:
[ 251.101967]
[ 251.101967] Chain exists of:
timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11
[ 251.101967] Possible unsafe locking scenario:
[ 251.101967]
[ 251.101967] CPU0 CPU1
[ 251.101967] ---- ----
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(&rt_b->rt_runtime_lock);
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(timekeeper_seq);
[ 251.101967]
[ 251.101967] *** DEADLOCK ***
[ 251.101967]
[ 251.101967] 3 locks held by kworker/10:1/4506:
[ 251.101967] #0: (events){.+.+.+}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #1: (hrtimer_work){+.+...}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] stack backtrace:
[ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053
[ 251.101967] Workqueue: events clock_was_set_work
So the best solution is to avoid calling clock_was_set_delayed() while
holding the timekeeping lock, and instead using a flag variable to
decide if we should call clock_was_set() once we've released the locks.
This works for the case here, where the do_adjtimex() was the deadlock
trigger point. Unfortuantely, in update_wall_time() we still hold
the jiffies lock, which would deadlock with the ipi triggered by
clock_was_set(), preventing us from calling it even after we drop the
timekeeping lock. So instead call clock_was_set_delayed() at that point.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: stable <stable@vger.kernel.org> #3.10+
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-11 09:18:18 +08:00
if ( tai ! = orig_tai )
clock_was_set ( ) ;
2013-09-12 07:50:56 +08:00
ntp_notify_cmos_timer ( ) ;
2013-03-23 03:28:15 +08:00
return ret ;
}
2013-03-23 02:31:29 +08:00
#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts: forwarded unchanged as the first argument of __hardpps()
 * @raw_ts: forwarded unchanged as the second argument of __hardpps()
 *
 * Calls __hardpps() with timekeeper_lock taken via
 * raw_spin_lock_irqsave() and inside a tk_core.seq write section.
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
	unsigned long irq_state;

	raw_spin_lock_irqsave(&timekeeper_lock, irq_state);
	write_seqcount_begin(&tk_core.seq);

	__hardpps(phase_ts, raw_ts);

	write_seqcount_end(&tk_core.seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, irq_state);
}
EXPORT_SYMBOL(hardpps);
#endif	/* CONFIG_NTP_PPS */
2013-03-23 02:31:29 +08:00
2011-01-27 22:59:10 +08:00
/**
* xtime_update ( ) - advances the timekeeping infrastructure
* @ ticks : number of ticks , that have elapsed since the last call .
*
* Must be called with interrupts disabled .
*/
void xtime_update ( unsigned long ticks )
{
2012-02-29 08:50:11 +08:00
write_seqlock ( & jiffies_lock ) ;
2011-01-27 22:59:10 +08:00
do_timer ( ticks ) ;
2012-02-29 08:50:11 +08:00
write_sequnlock ( & jiffies_lock ) ;
2013-12-13 05:10:55 +08:00
update_wall_time ( ) ;
2011-01-27 22:59:10 +08:00
}