From 41be4da4e85e58520b934040966a6ae919c66c2d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 2 Mar 2009 20:56:48 -0500
Subject: [PATCH 1/5] ring-buffer: reset write field for ring_buffer_read_page

Impact: fix ring_buffer_read_page

After a page is swapped into the ring buffer, the write field
must also be reset.

Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a8c275c01e83..9baad7ee4b36 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2492,6 +2492,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                rb_init_page(bpage);
                bpage = cpu_buffer->reader_page->page;
                cpu_buffer->reader_page->page = *data_page;
+               local_set(&cpu_buffer->reader_page->write, 0);
                cpu_buffer->reader_page->read = 0;
                *data_page = bpage;
        }
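The bug this fixes is easiest to see in a stand-alone model. The sketch below is not kernel code: buffer_page and swap_reader_page() are simplified stand-ins for the structures the patch touches. If the write counter keeps the old page's value after the swap, the next write on the supposedly empty page is computed from stale state:

#include <stdio.h>

/* Simplified stand-in for the kernel's struct buffer_page. */
struct buffer_page {
        unsigned long write;    /* index of the next write on this page */
        unsigned long read;     /* index of the next read */
        void *page;             /* the actual data page */
};

/* Model of the swap branch in ring_buffer_read_page(). */
static void *swap_reader_page(struct buffer_page *reader, void *fresh_page,
                              int reset_write)
{
        void *full_page = reader->page;

        reader->page = fresh_page;      /* give the empty page to the ring */
        if (reset_write)
                reader->write = 0;      /* the one-line fix in this patch */
        reader->read = 0;
        return full_page;               /* hand the full page to the caller */
}

int main(void)
{
        char page_a[16], page_b[16];
        struct buffer_page reader = { .write = 12, .read = 12, .page = page_a };

        swap_reader_page(&reader, page_b, 0);
        printf("without reset: write=%lu (stale, new page looks non-empty)\n",
               reader.write);

        reader.write = 12;
        reader.page = page_a;
        swap_reader_page(&reader, page_b, 1);
        printf("with reset:    write=%lu (fresh page starts empty)\n",
               reader.write);
        return 0;
}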
From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 3 Mar 2009 00:27:49 -0500
Subject: [PATCH 2/5] ring-buffer: fix ring_buffer_read_page

ring_buffer_read_page() was broken when it copied only part of a
page. This patch fixes that, and adds a length parameter so that a
caller can ask for only part of the buffer page.

Signed-off-by: Steven Rostedt
---
 include/linux/ring_buffer.h |  7 ++-
 kernel/trace/ring_buffer.c  | 92 ++++++++++++++++++++++++-------------
 2 files changed, 64 insertions(+), 35 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f5e793d69bd3..79fcbc4b09d6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 u64 ring_buffer_time_stamp(int cpu);
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
+size_t ring_buffer_page_len(void *page);
+
+
 /*
  * The below functions are fine to use outside the tracing facility.
  */
@@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; }
 
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
 void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
-int ring_buffer_read_page(struct ring_buffer *buffer,
-                          void **data_page, int cpu, int full);
+int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
+                          size_t len, int cpu, int full);
 
 enum ring_buffer_flags {
        RB_FL_OVERWRITE         = 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9baad7ee4b36..2ad6bae95a3d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
        local_set(&bpage->commit, 0);
 }
 
+size_t ring_buffer_page_len(void *page)
+{
+       return local_read(&((struct buffer_data_page *)page)->commit);
+}
+
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
@@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
  */
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 {
-       unsigned long addr;
        struct buffer_data_page *bpage;
+       unsigned long addr;
 
        addr = __get_free_page(GFP_KERNEL);
        if (!addr)
@@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
        bpage = (void *)addr;
 
+       rb_init_page(bpage);
+
        return bpage;
 }
 
@@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  * ring_buffer_read_page - extract a page from the ring buffer
  * @buffer: buffer to extract from
  * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
  * @cpu: the cpu of the buffer to extract
  * @full: should the extraction only happen when the page is full.
 
@@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *     rpage = ring_buffer_alloc_read_page(buffer);
  *     if (!rpage)
  *             return error;
- *     ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *     ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
  *     if (ret >= 0)
  *             process_page(rpage, ret);
  *
@@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct ring_buffer *buffer,
-                         void **data_page, int cpu, int full)
+                         void **data_page, size_t len, int cpu, int full)
 {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct ring_buffer_event *event;
        struct buffer_data_page *bpage;
+       struct buffer_page *reader;
        unsigned long flags;
+       unsigned int commit;
        unsigned int read;
        int ret = -1;
 
        if (!data_page)
-               return 0;
+               return -1;
 
        bpage = *data_page;
        if (!bpage)
-               return 0;
+               return -1;
 
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
-       /*
-        * rb_buffer_peek will get the next ring buffer if
-        * the current reader page is empty.
-        */
-       event = rb_buffer_peek(buffer, cpu, NULL);
-       if (!event)
+       reader = rb_get_reader_page(cpu_buffer);
+       if (!reader)
                goto out;
 
-       /* check for data */
-       if (!local_read(&cpu_buffer->reader_page->page->commit))
-               goto out;
+       event = rb_reader_event(cpu_buffer);
+
+       read = reader->read;
+       commit = rb_page_commit(reader);
 
-       read = cpu_buffer->reader_page->read;
        /*
-        * If the writer is already off of the read page, then simply
-        * switch the read page with the given page. Otherwise
-        * we need to copy the data from the reader to the writer.
+        * If len > what's left on the page, and the writer is also off of
+        * the read page, then simply switch the read page with the given
+        * page. Otherwise we need to copy the data from the reader to the
+        * writer.
        */
-       if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
-               unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
+       if ((len < (commit - read)) ||
+           cpu_buffer->reader_page == cpu_buffer->commit_page) {
                struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
+               unsigned int pos = read;
+               unsigned int size;
 
                if (full)
                        goto out;
 
-               /* The writer is still on the reader page, we must copy */
-               memcpy(bpage->data + read, rpage->data + read, commit - read);
-               /* consume what was read */
-               cpu_buffer->reader_page->read = commit;
+               if (len > (commit - read))
+                       len = (commit - read);
+
+               size = rb_event_length(event);
+
+               if (len < size)
+                       goto out;
+
+               /* Need to copy one event at a time */
+               do {
+                       memcpy(bpage->data + pos, rpage->data + pos, size);
+
+                       len -= size;
+
+                       rb_advance_reader(cpu_buffer);
+                       pos = reader->read;
+
+                       event = rb_reader_event(cpu_buffer);
+                       size = rb_event_length(event);
+               } while (len > size);
 
                /* update bpage */
-               local_set(&bpage->commit, commit);
-               if (!read)
-                       bpage->time_stamp = rpage->time_stamp;
+               local_set(&bpage->commit, pos);
+               bpage->time_stamp = rpage->time_stamp;
+
        } else {
                /* swap the pages */
                rb_init_page(bpage);
-               bpage = cpu_buffer->reader_page->page;
-               cpu_buffer->reader_page->page = *data_page;
-               local_set(&cpu_buffer->reader_page->write, 0);
-               cpu_buffer->reader_page->read = 0;
+               bpage = reader->page;
+               reader->page = *data_page;
+               local_set(&reader->write, 0);
+               reader->read = 0;
                *data_page = bpage;
+
+               /* update the entry counter */
+               rb_remove_entries(cpu_buffer, bpage, read);
        }
        ret = read;
 
-       /* update the entry counter */
-       rb_remove_entries(cpu_buffer, bpage, read);
 out:
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
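The important property of the new copy loop is that it moves whole events, never a partial one. The sketch below models that idea in user space with an invented encoding (one length byte followed by its payload; the real ring buffer packs type, length and a time delta into a 32-bit header), so copy_events() is illustrative only, not the kernel's algorithm verbatim:

#include <stdio.h>
#include <string.h>

/*
 * Toy event stream for the demo: each event is one length byte followed
 * by that many payload bytes. Copy whole events while they fit in len.
 */
static size_t copy_events(unsigned char *dst, const unsigned char *src,
                          size_t commit, size_t len)
{
        size_t pos = 0;

        while (pos < commit) {
                size_t size = 1 + src[pos];     /* whole event: header + payload */

                if (size > len)
                        break;                  /* never copy a partial event */
                memcpy(dst + pos, src + pos, size);
                len -= size;
                pos += size;
        }
        return pos;     /* how much of the page was consumed */
}

int main(void)
{
        /* three events with 3-, 2- and 4-byte payloads */
        unsigned char src[] = { 3, 'a', 'b', 'c', 2, 'd', 'e',
                                4, 'w', 'x', 'y', 'z' };
        unsigned char dst[sizeof(src)];

        /* len = 8 holds the first two events (4 + 3 bytes), not the third */
        size_t copied = copy_events(dst, src, sizeof(src), 8);

        printf("copied %zu of %zu bytes\n", copied, sizeof(src));
        return 0;
}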
From e3d6bf0a0781a269f34250fd41e0d3dbfe540cf1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 3 Mar 2009 13:53:07 -0500
Subject: [PATCH 3/5] ring-buffer: replace sizeof of event header with offsetof

Impact: fix possible alignment problems on some archs

On some archs the compiler adds tail padding for the zero-length
array at the end of a struct when computing sizeof. Since the
ring_buffer_event type ends with one of these, it is better to use
offsetof of that array instead, to avoid strange bugs on those archs.

Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2ad6bae95a3d..27cf834d8b4e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -132,7 +132,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
-#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT           4U
 #define RB_MAX_SMALL_DATA      28
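The hazard is plain C, so it can be shown in user space. The struct below is a contrived stand-in, not the kernel's ring_buffer_event: its 8-byte member forces 8-byte struct alignment, so sizeof rounds up past the flexible array member, while offsetof of that array gives the true offset where data begins:

#include <stdio.h>
#include <stddef.h>

/*
 * Contrived header for the demo. On x86-64, sizeof yields 16 (the 8-byte
 * ts pads the struct to 8-byte alignment) while offsetof(array) yields 9,
 * the real start of the data.
 */
struct event {
        unsigned long long ts;
        unsigned char type;
        unsigned char array[];  /* event data starts here */
};

int main(void)
{
        printf("sizeof          = %zu\n", sizeof(struct event));
        printf("offsetof(array) = %zu\n", offsetof(struct event, array));
        return 0;
}

Using sizeof as the header size here would place the data 7 bytes past where it actually starts; offsetof never includes the tail padding.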
From 474d32b68d6d842f3e710e9ae9fe2568c53339f8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 3 Mar 2009 19:51:40 -0500
Subject: [PATCH 4/5] ring-buffer: make ring_buffer_read_page read from start
 on partial page

Impact: don't leave holes in the read buffer page

ring_buffer_read_page() swaps a given page with the reader page of
the ring buffer, if certain conditions are met:

 1) the requested length is big enough to hold the entire page data
 2) a writer is not currently on the page
 3) the page is not partially consumed

When those conditions are not met, it copies the data to the supplied
page instead of swapping. But currently the data is copied at the same
offset it has in the source page, which leaves a hole at the start of
the reader page. This complicates the use of this function. Instead,
the data should be copied to the beginning of the page and the index
fields updated accordingly.

Other small clean ups are also done in this patch.

Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 43 +++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 27cf834d8b4e..f2a163db52f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -61,6 +61,8 @@ enum {
 
 static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
 /**
  * tracing_on - enable all tracing buffers
  *
@@ -234,9 +236,16 @@ static void rb_init_page(struct buffer_data_page *bpage)
        local_set(&bpage->commit, 0);
 }
 
+/**
+ * ring_buffer_page_len - the size of data on the page.
+ * @page: The page to read
+ *
+ * Returns the amount of data on the page, including buffer page header.
+ */
 size_t ring_buffer_page_len(void *page)
 {
-       return local_read(&((struct buffer_data_page *)page)->commit);
+       return local_read(&((struct buffer_data_page *)page)->commit)
+               + BUF_PAGE_HDR_SIZE;
 }
 
 /*
@@ -259,7 +268,7 @@ static inline int test_time_stamp(u64 delta)
        return 0;
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
+#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
 /*
  * head_page == tail_page && head == tail then buffer is empty.
@@ -2454,6 +2463,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
        unsigned int read;
        int ret = -1;
 
+       /*
+        * If len is not big enough to hold the page header, then
+        * we can not copy anything.
+        */
+       if (len <= BUF_PAGE_HDR_SIZE)
+               return -1;
+
+       len -= BUF_PAGE_HDR_SIZE;
+
        if (!data_page)
                return -1;
 
@@ -2473,15 +2491,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
        commit = rb_page_commit(reader);
 
        /*
-        * If len > what's left on the page, and the writer is also off of
-        * the read page, then simply switch the read page with the given
-        * page. Otherwise we need to copy the data from the reader to the
-        * writer.
+        * If this page has been partially read or
+        * if len is not big enough to read the rest of the page or
+        * a writer is still on the page, then
+        * we must copy the data from the page to the buffer.
+        * Otherwise, we can simply swap the page with the one passed in.
        */
-       if ((len < (commit - read)) ||
+       if (read || (len < (commit - read)) ||
            cpu_buffer->reader_page == cpu_buffer->commit_page) {
                struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
-               unsigned int pos = read;
+               unsigned int rpos = read;
+               unsigned int pos = 0;
                unsigned int size;
 
                if (full)
                        goto out;
@@ -2497,12 +2517,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
                /* Need to copy one event at a time */
                do {
-                       memcpy(bpage->data + pos, rpage->data + pos, size);
+                       memcpy(bpage->data + pos, rpage->data + rpos, size);
 
                        len -= size;
 
                        rb_advance_reader(cpu_buffer);
-                       pos = reader->read;
+                       rpos = reader->read;
+                       pos += size;
 
                        event = rb_reader_event(cpu_buffer);
                        size = rb_event_length(event);
@@ -2512,6 +2533,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                local_set(&bpage->commit, pos);
                bpage->time_stamp = rpage->time_stamp;
 
+               /* we copied everything to the beginning */
+               read = 0;
        } else {
                /* swap the pages */
                rb_init_page(bpage);
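The key change is the rpos/pos split: rpos follows the source page from the current read position, while pos starts at zero in the destination, so the copy is packed at the top of the supplied page and the caller's read index can simply be reset to 0. A user-space sketch of that idea, with names and the byte layout invented for the demo:

#include <stdio.h>
#include <string.h>

/*
 * Copy "commit - read" bytes so they land at offset 0 of dst, the way
 * this patch packs partially-consumed pages. Before the patch both
 * offsets walked the source position, leaving a "read"-byte hole at
 * the start of dst.
 */
static size_t read_page(char *dst, const char *src,
                        size_t read, size_t commit)
{
        size_t rpos = read;     /* source offset: where unread data starts */
        size_t pos = 0;         /* destination offset: always from the top */

        while (rpos < commit)
                dst[pos++] = src[rpos++];
        return pos;             /* new commit of dst; new read index is 0 */
}

int main(void)
{
        const char src[] = "0123456789";        /* page with 10 bytes committed */
        char dst[16] = { 0 };
        size_t n = read_page(dst, src, 4, 10);  /* 4 bytes already consumed */

        printf("copied %zu bytes: \"%.*s\" (packed at offset 0)\n",
               n, (int)n, dst);
        return 0;
}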
From 2cadf9135eb3b6d84b6427314be827ddd443c308 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 1 Dec 2008 22:20:19 -0500
Subject: [PATCH 5/5] tracing: add binary buffer files for use with splice

Impact: new feature

This patch creates a directory of files that correspond to the
per-CPU ring buffers. These are binary files and are made to be
used with splice. This is the fastest way to extract data from
the ftrace ring buffers.

Thanks to Jiaying Zhang for pushing me to get this code fixed,
and to Eduard - Gabriel Munteanu for his splice code that helped
me debug my code.

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace.c | 274 +++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h |   1 +
 2 files changed, 268 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea055aa21cd9..12539f72f4a5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,31 +11,30 @@
  * Copyright (C) 2004-2006 Ingo Molnar
  * Copyright (C) 2004 William Lee Irwin III
  */
+#include
 #include
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
-#include
-#include
-#include
-
-#include
-#include
-#include
 
 #include "trace.h"
 #include "trace_output.h"
@@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = {
        .write          = tracing_mark_write,
 };
 
+struct ftrace_buffer_info {
+       struct trace_array      *tr;
+       void                    *spare;
+       int                     cpu;
+       unsigned int            read;
+};
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp)
+{
+       int cpu = (int)(long)inode->i_private;
+       struct ftrace_buffer_info *info;
+
+       if (tracing_disabled)
+               return -ENODEV;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       info->tr        = &global_trace;
+       info->cpu       = cpu;
+       info->spare     = ring_buffer_alloc_read_page(info->tr->buffer);
+       /* Force reading ring buffer for first read */
+       info->read      = (unsigned int)-1;
+       if (!info->spare)
+               goto out;
+
+       filp->private_data = info;
+
+       return 0;
+
+ out:
+       kfree(info);
+       return -ENOMEM;
+}
+
+static ssize_t
+tracing_buffers_read(struct file *filp, char __user *ubuf,
+                    size_t count, loff_t *ppos)
+{
+       struct ftrace_buffer_info *info = filp->private_data;
+       unsigned int pos;
+       ssize_t ret;
+       size_t size;
+
+       /* Do we have previous read data to read? */
+       if (info->read < PAGE_SIZE)
+               goto read;
+
+       info->read = 0;
+
+       ret = ring_buffer_read_page(info->tr->buffer,
+                                   &info->spare,
+                                   count,
+                                   info->cpu, 0);
+       if (ret < 0)
+               return 0;
+
+       pos = ring_buffer_page_len(info->spare);
+
+       if (pos < PAGE_SIZE)
+               memset(info->spare + pos, 0, PAGE_SIZE - pos);
+
+read:
+       size = PAGE_SIZE - info->read;
+       if (size > count)
+               size = count;
+
+       ret = copy_to_user(ubuf, info->spare + info->read, size);
+       if (ret)
+               return -EFAULT;
+       *ppos += size;
+       info->read += size;
+
+       return size;
+}
+
+static int tracing_buffers_release(struct inode *inode, struct file *file)
+{
+       struct ftrace_buffer_info *info = file->private_data;
+
+       ring_buffer_free_read_page(info->tr->buffer, info->spare);
+       kfree(info);
+
+       return 0;
+}
+
+struct buffer_ref {
+       struct ring_buffer      *buffer;
+       void                    *page;
+       int                     ref;
+};
+
+static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
+                                   struct pipe_buffer *buf)
+{
+       struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+       if (--ref->ref)
+               return;
+
+       ring_buffer_free_read_page(ref->buffer, ref->page);
+       kfree(ref);
+       buf->private = 0;
+}
+
+static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                struct pipe_buffer *buf)
+{
+       return 1;
+}
+
+static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+                               struct pipe_buffer *buf)
+{
+       struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+       ref->ref++;
+}
+
+/* Pipe buffer operations for a buffer. */
+static struct pipe_buf_operations buffer_pipe_buf_ops = {
+       .can_merge              = 0,
+       .map                    = generic_pipe_buf_map,
+       .unmap                  = generic_pipe_buf_unmap,
+       .confirm                = generic_pipe_buf_confirm,
+       .release                = buffer_pipe_buf_release,
+       .steal                  = buffer_pipe_buf_steal,
+       .get                    = buffer_pipe_buf_get,
+};
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+       struct buffer_ref *ref =
+               (struct buffer_ref *)spd->partial[i].private;
+
+       if (--ref->ref)
+               return;
+
+       ring_buffer_free_read_page(ref->buffer, ref->page);
+       kfree(ref);
+       spd->partial[i].private = 0;
+}
+
+static ssize_t
+tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+                           struct pipe_inode_info *pipe, size_t len,
+                           unsigned int flags)
+{
+       struct ftrace_buffer_info *info = file->private_data;
+       struct partial_page partial[PIPE_BUFFERS];
+       struct page *pages[PIPE_BUFFERS];
+       struct splice_pipe_desc spd = {
+               .pages          = pages,
+               .partial        = partial,
+               .flags          = flags,
+               .ops            = &buffer_pipe_buf_ops,
+               .spd_release    = buffer_spd_release,
+       };
+       struct buffer_ref *ref;
+       int size, i;
+       size_t ret;
+
+       /*
+        * We can't seek on a buffer input
+        */
+       if (unlikely(*ppos))
+               return -ESPIPE;
+
+
+       for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+               struct page *page;
+               int r;
+
+               ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+               if (!ref)
+                       break;
+
+               ref->buffer = info->tr->buffer;
+               ref->page = ring_buffer_alloc_read_page(ref->buffer);
+               if (!ref->page) {
+                       kfree(ref);
+                       break;
+               }
+
+               r = ring_buffer_read_page(ref->buffer, &ref->page,
+                                         len, info->cpu, 0);
+               if (r < 0) {
+                       ring_buffer_free_read_page(ref->buffer,
+                                                  ref->page);
+                       kfree(ref);
+                       break;
+               }
+
+               /*
+                * zero out any left over data, this is going to
+                * user land.
+                */
+               size = ring_buffer_page_len(ref->page);
+               if (size < PAGE_SIZE)
+                       memset(ref->page + size, 0, PAGE_SIZE - size);
+
+               page = virt_to_page(ref->page);
+
+               spd.pages[i] = page;
+               spd.partial[i].len = PAGE_SIZE;
+               spd.partial[i].offset = 0;
+               spd.partial[i].private = (unsigned long)ref;
+               spd.nr_pages++;
+       }
+
+       spd.nr_pages = i;
+
+       /* did we read anything? */
+       if (!spd.nr_pages) {
+               if (flags & SPLICE_F_NONBLOCK)
+                       ret = -EAGAIN;
+               else
+                       ret = 0;
+               /* TODO: block */
+               return ret;
+       }
+
+       ret = splice_to_pipe(pipe, &spd);
+
+       return ret;
+}
+
+static const struct file_operations tracing_buffers_fops = {
+       .open           = tracing_buffers_open,
+       .read           = tracing_buffers_read,
+       .release        = tracing_buffers_release,
+       .splice_read    = tracing_buffers_splice_read,
+       .llseek         = no_llseek,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void)
 static __init int tracer_init_debugfs(void)
 {
        struct dentry *d_tracer;
+       struct dentry *buffers;
        struct dentry *entry;
        int cpu;
 
@@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void)
                pr_warning("Could not create debugfs "
                           "'trace_marker' entry\n");
 
+       buffers = debugfs_create_dir("binary_buffers", d_tracer);
+
+       if (!buffers)
+               pr_warning("Could not create buffers directory\n");
+       else {
+               int cpu;
+               char buf[64];
+
+               for_each_tracing_cpu(cpu) {
+                       sprintf(buf, "%d", cpu);
+
+                       entry = debugfs_create_file(buf, 0444, buffers,
+                                                   (void *)(long)cpu,
+                                                   &tracing_buffers_fops);
+                       if (!entry)
+                               pr_warning("Could not create debugfs buffers "
+                                          "'%s' entry\n", buf);
+               }
+       }
+
 #ifdef CONFIG_DYNAMIC_FTRACE
        entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                    &ftrace_update_tot_cnt,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e606633fb498..561bb5c5d988 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -217,6 +217,7 @@ enum trace_flag_type {
  */
 struct trace_array_cpu {
        atomic_t                disabled;
+       void                    *buffer_page;   /* ring buffer spare */
 
        /* these fields get copied into max-trace: */
        unsigned long           trace_idx;
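A minimal user-space consumer of these files could look like the sketch below. It assumes debugfs is mounted at /sys/kernel/debug, so the per-CPU files this patch creates appear under /sys/kernel/debug/tracing/binary_buffers/. It drains CPU 0's buffer into trace.bin through a pipe and skips the retry logic and CPU iteration a real tool would need:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* per-cpu binary file created by this patch; mount point assumed */
        const char *path = "/sys/kernel/debug/tracing/binary_buffers/0";
        int in = open(path, O_RDONLY);
        int out = open("trace.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        int pfd[2];
        ssize_t n;

        if (in < 0 || out < 0 || pipe(pfd) < 0) {
                perror("setup");
                return 1;
        }

        /* splice page-sized chunks: file -> pipe -> file, no user copies */
        while ((n = splice(in, NULL, pfd[1], NULL, 4096,
                           SPLICE_F_MOVE | SPLICE_F_NONBLOCK)) > 0) {
                if (splice(pfd[0], NULL, out, NULL, n, SPLICE_F_MOVE) < 0) {
                        perror("splice out");
                        break;
                }
        }

        close(in);
        close(out);
        return 0;
}

The SPLICE_F_NONBLOCK flag matches the patch's behavior of returning -EAGAIN when the buffer is empty, so the loop exits once the ring has been drained rather than blocking on the TODO path.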