diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7ee6619951ea..802bb38d9c81 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,7 +27,6 @@ #include #include -#include static void update_pages_handler(struct work_struct *work); @@ -449,6 +448,28 @@ enum { RB_CTX_MAX }; +#if BITS_PER_LONG == 32 +#define RB_TIME_32 +#endif + +/* To test on 64 bit machines */ +//#define RB_TIME_32 + +#ifdef RB_TIME_32 + +struct rb_time_struct { + local_t cnt; + local_t top; + local_t bottom; +}; +#else +#include +struct rb_time_struct { + local64_t time; +}; +#endif +typedef struct rb_time_struct rb_time_t; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -484,8 +505,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - local64_t write_stamp; - local64_t before_stamp; + rb_time_t write_stamp; + rb_time_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -528,6 +549,189 @@ struct ring_buffer_iter { int missed_events; }; +#ifdef RB_TIME_32 + +/* + * On 32 bit machines, local64_t is very expensive. As the ring + * buffer doesn't need all the features of a true 64 bit atomic, + * on 32 bit, it uses these functions (64 still uses local64_t). + * + * For the ring buffer, 64 bit required operations for the time is + * the following: + * + * - Only need 59 bits (uses 60 to make it even). + * - Reads may fail if it interrupted a modification of the time stamp. + * It will succeed if it did not interrupt another write even if + * the read itself is interrupted by a write. + * It returns whether it was successful or not. + * + * - Writes always succeed and will overwrite other writes and writes + * that were done by events interrupting the current write. + * + * - A write followed by a read of the same time stamp will always succeed, + * but may not contain the same value. + * + * - A cmpxchg will fail if it interrupted another write or cmpxchg. + * Other than that, it acts like a normal cmpxchg. + * + * The 60 bit time stamp is broken up by 30 bits in a top and bottom half + * (bottom being the least significant 30 bits of the 60 bit time stamp). + * + * The two most significant bits of each half holds a 2 bit counter (0-3). + * Each update will increment this counter by one. + * When reading the top and bottom, if the two counter bits match then the + * top and bottom together make a valid 60 bit number. + */ +#define RB_TIME_SHIFT 30 +#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) + +static inline int rb_time_cnt(unsigned long val) +{ + return (val >> RB_TIME_SHIFT) & 3; +} + +static inline u64 rb_time_val(unsigned long top, unsigned long bottom) +{ + u64 val; + + val = top & RB_TIME_VAL_MASK; + val <<= RB_TIME_SHIFT; + val |= bottom & RB_TIME_VAL_MASK; + + return val; +} + +static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) +{ + unsigned long top, bottom; + unsigned long c; + + /* + * If the read is interrupted by a write, then the cnt will + * be different. Loop until both top and bottom have been read + * without interruption. + */ + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(bottom)) + return false; + + *ret = rb_time_val(top, bottom); + return true; +} + +static bool rb_time_read(rb_time_t *t, u64 *ret) +{ + unsigned long cnt; + + return __rb_time_read(t, ret, &cnt); +} + +static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +{ + return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); +} + +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +{ + *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); + *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); +} + +static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) +{ + val = rb_time_val_cnt(val, cnt); + local_set(t, val); +} + +static void rb_time_set(rb_time_t *t, u64 val) +{ + unsigned long cnt, top, bottom; + + rb_time_split(val, &top, &bottom); + + /* Writes always succeed with a valid number even if it gets interrupted. */ + do { + cnt = local_inc_return(&t->cnt); + rb_time_val_set(&t->top, top, cnt); + rb_time_val_set(&t->bottom, bottom, cnt); + } while (cnt != local_read(&t->cnt)); +} + +static inline bool +rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) +{ + unsigned long ret; + + ret = local_cmpxchg(l, expect, set); + return ret == expect; +} + +static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + unsigned long cnt, top, bottom; + unsigned long cnt2, top2, bottom2; + u64 val; + + /* The cmpxchg always fails if it interrupted an update */ + if (!__rb_time_read(t, &val, &cnt2)) + return false; + + if (val != expect) + return false; + + cnt = local_read(&t->cnt); + if ((cnt & 3) != cnt2) + return false; + + cnt2 = cnt + 1; + + rb_time_split(val, &top, &bottom); + top = rb_time_val_cnt(top, cnt); + bottom = rb_time_val_cnt(bottom, cnt); + + rb_time_split(set, &top2, &bottom2); + top2 = rb_time_val_cnt(top2, cnt2); + bottom2 = rb_time_val_cnt(bottom2, cnt2); + + if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) + return false; + return true; +} + +#else /* 64 bits */ + +/* local64_t always succeeds */ + +static inline bool rb_time_read(rb_time_t *t, u64 *ret) +{ + *ret = local64_read(&t->time); + return true; +} +static void rb_time_set(rb_time_t *t, u64 val) +{ + local64_set(&t->time, val); +} + +static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + u64 val; + val = local64_cmpxchg(&t->time, expect, set); + return val == expect; +} +#endif + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -2545,7 +2749,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, delta = rb_time_delta(event); - write_stamp = local64_read(&cpu_buffer->write_stamp); + if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) + return 0; /* Make sure the write stamp is read before testing the location */ barrier(); @@ -2554,11 +2759,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); - u64 ret; - ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta); /* Something came in, can't discard */ - if (ret != write_stamp) + if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, + write_stamp, write_stamp - delta)) return 0; /* @@ -2889,11 +3093,13 @@ static noinline void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { + u64 write_stamp; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, - (unsigned long long)local64_read(&cpu_buffer->write_stamp), + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2909,14 +3115,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page; unsigned long tail, write, w; u64 before, after; + bool a_ok; + bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -2927,16 +3135,18 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->delta = info->ts - after; } - if (unlikely(test_time_stamp(info->delta))) { - rb_check_timestamp(cpu_buffer, info); - info->add_timestamp |= RB_ADD_STAMP_EXTEND; + if (likely(a_ok && b_ok)) { + if (unlikely(test_time_stamp(info->delta))) { + rb_check_timestamp(cpu_buffer, info); + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + } } /* * If interrupting an event time update, we may need an absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ - if (unlikely(before != after && w)) + if (unlikely(!a_ok || !b_ok || (before != after && w))) info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; /* @@ -2947,7 +3157,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - /*B*/ local64_set(&cpu_buffer->before_stamp, info->ts); + /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); @@ -2960,21 +3170,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(write > BUF_PAGE_SIZE)) { if (tail != w) { /* before and after may now different, fix it up*/ - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); - if (before != after) - (void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + if (a_ok && b_ok && before != after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after); } return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { u64 save_before; + bool s_ok; /* Nothing interrupted us between A and C */ - /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); + /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); barrier(); - /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); + /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); + RB_WARN_ON(cpu_buffer, !s_ok); if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -2986,27 +3198,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + RB_WARN_ON(cpu_buffer, !a_ok); + /* Write stamp must only go forward */ if (save_before > after) { /* * We do not care about the result, only that * it gets updated atomically. */ - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before); } } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && after < ts) { /* Nothing came after this event between C and E */ info->delta = ts - after; - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); info->ts = ts; } else { /* @@ -4565,8 +4781,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - local64_set(&cpu_buffer->write_stamp, 0); - local64_set(&cpu_buffer->before_stamp, 0); + rb_time_set(&cpu_buffer->write_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0;