forked from mirrors/linux
Lei Chen raised an issue with CLOCK_MONOTONIC_COARSE seeing time
inconsistencies. Lei tracked down that this was being caused by the
adjustment:
tk->tkr_mono.xtime_nsec -= offset;
which is made to compensate for the unaccumulated cycles in offset when the
multiplicator is adjusted forward, so that the non-_COARSE clockids don't
see inconsistencies.
However, the _COARSE clockid getter functions use the adjusted xtime_nsec
value directly and do not compensate the negative offset via the
clocksource delta multiplied with the new multiplicator. In that case the
caller can observe time going backwards in consecutive calls.
By design, this negative adjustment should be fine, because the logic run
from timekeeping_adjust() is done after it accumulated approximately
multiplicator * interval_cycles
into xtime_nsec. The accumulated value is always larger than the
mult_adj * offset
value, which is subtracted from xtime_nsec. Both operations are done
together under the tk_core.lock, so the net change to xtime_nsec is
always positive.
However, do_adjtimex() calls into timekeeping_advance() as well, to
apply the NTP frequency adjustment immediately. In this case,
timekeeping_advance() does not return early when the offset is smaller
than interval_cycles. In that case there is no time accumulated into
xtime_nsec. But the subsequent call into timekeeping_adjust(), which
modifies the multiplicator, subtracts from xtime_nsec to correct for the
new multiplicator.
Here because there was no accumulation, xtime_nsec becomes smaller than
before, which opens a window up to the next accumulation, where the
_COARSE clockid getters, which don't compensate for the offset, can
observe the inconsistency.
A previous attempt fixed this by forwarding the timekeeper in the case
that adjtimex() adjusts the multiplier, which resets the offset to zero:
757b000f7b ("timekeeping: Fix possible inconsistencies in _COARSE clockids")
That works correctly, but unfortunately causes a regression on the
adjtimex() side. There are two issues:
1) The forwarding of the base time moves the update out of the original
period and establishes a new one.
2) The clearing of the accumulated NTP error is changing the behaviour as
well.
User-space expects that multiplier/frequency updates are in effect, when the
syscall returns, so delaying the update to the next tick is not solving the
problem either.
Commit 757b000f7b was reverted so that the established expectations of
user space implementations (ntpd, chronyd) are restored, but that obviously
brought the inconsistencies back.
One of the initial approaches to fix this was to establish a separate
storage for the coarse time getter nanoseconds part by calculating it from
the offset. That was dropped on the floor because not having yet another
state to maintain was simpler. But given the result of the above exercise,
this solution turns out to be the right one. Bring it back in a slightly
modified form.
Thus introduce timekeeper::coarse_nsec and store that nanoseconds part in
it, switch the time getter functions and the VDSO update to use that value.
coarse_nsec is set on operations which forward or initialize the timekeeper
and after time was accumulated during a tick. If there is no accumulation
the timestamp is unchanged.
This leaves the adjtimex() behaviour unmodified and prevents coarse time
from going backwards.
[ jstultz: Simplified the coarse_nsec calculation and kept behavior so
coarse clockids aren't adjusted on each inter-tick adjtimex
call, slightly reworked the comments and commit message ]
Fixes: da15cfdae0 ("time: Introduce CLOCK_REALTIME_COARSE")
Reported-by: Lei Chen <lei.chen@smartx.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/all/20250419054706.2319105-1-jstultz@google.com
Closes: https://lore.kernel.org/lkml/20250310030004.3705801-1-lei.chen@smartx.com/
166 lines
5.7 KiB
C
166 lines
5.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* You SHOULD NOT be including this unless you're vsyscall
|
|
* handling code or timekeeping internal code!
|
|
*/
|
|
|
|
#ifndef _LINUX_TIMEKEEPER_INTERNAL_H
|
|
#define _LINUX_TIMEKEEPER_INTERNAL_H
|
|
|
|
#include <linux/clocksource.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/time.h>
|
|
|
|
/**
|
|
* struct tk_read_base - base structure for timekeeping readout
|
|
* @clock: Current clocksource used for timekeeping.
|
|
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
|
|
* @cycle_last: @clock cycle value at last update
|
|
* @mult: (NTP adjusted) multiplier for scaled math conversion
|
|
* @shift: Shift value for scaled math conversion
|
|
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
|
|
* @base: ktime_t (nanoseconds) base time for readout
|
|
* @base_real: Nanoseconds base value for clock REALTIME readout
|
|
*
|
|
* This struct has size 56 byte on 64 bit. Together with a seqcount it
|
|
* occupies a single 64byte cache line.
|
|
*
|
|
* The struct is separate from struct timekeeper as it is also used
|
|
* for the fast NMI safe accessors.
|
|
*
|
|
* @base_real is for the fast NMI safe accessor to allow reading clock
|
|
* realtime from any context.
|
|
*/
|
|
struct tk_read_base {
|
|
struct clocksource *clock;
|
|
u64 mask;
|
|
u64 cycle_last;
|
|
u32 mult;
|
|
u32 shift;
|
|
u64 xtime_nsec;
|
|
ktime_t base;
|
|
u64 base_real;
|
|
};
|
|
|
|
/**
|
|
* struct timekeeper - Structure holding internal timekeeping values.
|
|
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC
|
|
* @xtime_sec: Current CLOCK_REALTIME time in seconds
|
|
* @ktime_sec: Current CLOCK_MONOTONIC time in seconds
|
|
* @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
|
|
* @offs_real: Offset clock monotonic -> clock realtime
|
|
* @offs_boot: Offset clock monotonic -> clock boottime
|
|
* @offs_tai: Offset clock monotonic -> clock tai
|
|
* @coarse_nsec: The nanoseconds part for coarse time getters
|
|
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
|
|
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
|
|
* @clock_was_set_seq: The sequence number of clock was set events
|
|
* @cs_was_changed_seq: The sequence number of clocksource change events
|
|
* @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
|
|
* @cycle_interval: Number of clock cycles in one NTP interval
|
|
* @xtime_interval: Number of clock shifted nano seconds in one NTP
|
|
* interval.
|
|
* @xtime_remainder: Shifted nano seconds left over when rounding
|
|
* @cycle_interval
|
|
* @raw_interval: Shifted raw nano seconds accumulated per NTP interval.
|
|
* @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second
|
|
* @ntp_tick: The ntp_tick_length() value currently being
|
|
* used. This cached copy ensures we consistently
|
|
* apply the tick length for an entire tick, as
|
|
* ntp_tick_length may change mid-tick, and we don't
|
|
* want to apply that new value to the tick in
|
|
* progress.
|
|
* @ntp_error: Difference between accumulated time and NTP time in ntp
|
|
* shifted nano seconds.
|
|
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
|
|
* ntp shifted nano seconds.
|
|
* @ntp_err_mult: Multiplication factor for scaled math conversion
|
|
* @skip_second_overflow: Flag used to avoid updating NTP twice with same second
|
|
* @tai_offset: The current UTC to TAI offset in seconds
|
|
*
|
|
* Note: For timespec(64) based interfaces wall_to_monotonic is what
|
|
* we need to add to xtime (or xtime corrected for sub jiffy times)
|
|
* to get to monotonic time. Monotonic is pegged at zero at system
|
|
* boot time, so wall_to_monotonic will be negative, however, we will
|
|
* ALWAYS keep the tv_nsec part positive so we can use the usual
|
|
* normalization.
|
|
*
|
|
* wall_to_monotonic is moved after resume from suspend for the
|
|
* monotonic time not to jump. We need to add total_sleep_time to
|
|
* wall_to_monotonic to get the real boot based time offset.
|
|
*
|
|
* wall_to_monotonic is no longer the boot time, getboottime must be
|
|
* used instead.
|
|
*
|
|
* @monotonic_to_boottime is a timespec64 representation of @offs_boot to
|
|
* accelerate the VDSO update for CLOCK_BOOTTIME.
|
|
*
|
|
* The cacheline ordering of the structure is optimized for in kernel usage of
|
|
* the ktime_get() and ktime_get_ts64() family of time accessors. Struct
|
|
* timekeeper is prepended in the core timekeeping code with a sequence count,
|
|
* which results in the following cacheline layout:
|
|
*
|
|
* 0: seqcount, tkr_mono
|
|
* 1: xtime_sec ... coarse_nsec
|
|
* 2: tkr_raw, raw_sec
|
|
* 3,4: Internal variables
|
|
*
|
|
* Cacheline 0,1 contain the data which is used for accessing
|
|
* CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
|
|
* data for accessing CLOCK_MONOTONIC_RAW. Cacheline 3,4 are internal
|
|
* variables which are only accessed during timekeeper updates once per
|
|
* tick.
|
|
*/
|
|
struct timekeeper {
|
|
/* Cacheline 0 (together with prepended seqcount of timekeeper core): */
|
|
struct tk_read_base tkr_mono;
|
|
|
|
/* Cacheline 1: */
|
|
u64 xtime_sec;
|
|
unsigned long ktime_sec;
|
|
struct timespec64 wall_to_monotonic;
|
|
ktime_t offs_real;
|
|
ktime_t offs_boot;
|
|
ktime_t offs_tai;
|
|
u32 coarse_nsec;
|
|
|
|
/* Cacheline 2: */
|
|
struct tk_read_base tkr_raw;
|
|
u64 raw_sec;
|
|
|
|
/* Cachline 3 and 4 (timekeeping internal variables): */
|
|
unsigned int clock_was_set_seq;
|
|
u8 cs_was_changed_seq;
|
|
|
|
struct timespec64 monotonic_to_boot;
|
|
|
|
u64 cycle_interval;
|
|
u64 xtime_interval;
|
|
s64 xtime_remainder;
|
|
u64 raw_interval;
|
|
|
|
ktime_t next_leap_ktime;
|
|
u64 ntp_tick;
|
|
s64 ntp_error;
|
|
u32 ntp_error_shift;
|
|
u32 ntp_err_mult;
|
|
u32 skip_second_overflow;
|
|
s32 tai_offset;
|
|
};
|
|
|
|
#ifdef CONFIG_GENERIC_TIME_VSYSCALL

/*
 * With CONFIG_GENERIC_TIME_VSYSCALL set, the update hooks are only
 * declared here; their definitions live outside this header.
 */
extern void update_vsyscall(struct timekeeper *tk);
extern void update_vsyscall_tz(void);

#else

/*
 * Without CONFIG_GENERIC_TIME_VSYSCALL both hooks are empty inline
 * stubs, so a call to either of them does nothing.
 */
static inline void update_vsyscall(struct timekeeper *tk)
{
}
static inline void update_vsyscall_tz(void)
{
}
#endif
|
|
|
|
#endif /* _LINUX_TIMEKEEPER_INTERNAL_H */
|