From 1a17662ea033674a58bad3603531b0b5d42572f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:47:30 +0800 Subject: [PATCH 01/60] blktrace: fix possible memory leak When we failed to create "block" debugfs dir, we should do some cleanups. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5B2.8000800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b171778e3863..fb3bc53835dd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -432,7 +432,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); if (!blk_tree_root) - return -ENOMEM; + goto err; } dir = debugfs_create_dir(buts->name, blk_tree_root); From 5006ea73f38caef6065d1136808413813271633f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:03 +0800 Subject: [PATCH 02/60] blktrace: make blk_tracer_enabled a bool flag It doesn't have to be a counter, and it can be a bool flag instead. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C2F5D3.8090104@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb3bc53835dd..73845b7968bb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -30,7 +30,7 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; +static bool blk_tracer_enabled __read_mostly; /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -1111,9 +1111,7 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); + blk_tracer_enabled = true; return 0; } @@ -1131,11 +1129,7 @@ static void blk_tracer_reset(struct trace_array *tr) if (!atomic_read(&blk_probes_ref)) return; - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - + blk_tracer_enabled = false; blk_tracer_stop(tr); } From 3c289ba7c320560ee74979a8895141c829046a2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:26 +0800 Subject: [PATCH 03/60] blktrace: remove blk_probe_mutex blk_register_tracepoints() always returns 0, so make it return void, thus we don't need to use blk_probe_mutex to protect blk_probes_ref. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5EA.8060606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 73845b7968bb..223b92e77b3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); -static int blk_register_tracepoints(void); +static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* @@ -256,10 +255,8 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -471,13 +468,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); @@ -487,9 +479,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); err: if (bt) { if (bt->msg_file) @@ -863,7 +852,7 @@ void blk_add_driver_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_add_driver_data); -static int blk_register_tracepoints(void) +static void blk_register_tracepoints(void) { int ret; @@ -901,7 +890,6 @@ static int blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); - return 0; } static void blk_unregister_tracepoints(void) @@ -1099,11 +1087,8 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); + blk_register_tracepoints(); trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1118,10 +1103,8 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } static void blk_tracer_reset(struct trace_array *tr) From cbe28296eb1ac441b35cf45804d0ae808add7dd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:47 +0800 Subject: [PATCH 04/60] blktrace: don't increase blk_probes_ref if failed to setup blk trace do_blk_trace_setup() may return EBUSY, but the current code doesn't decrease blk_probes_ref in this case. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5FF.80002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 223b92e77b3f..11e7c8d9d222 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -468,9 +468,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - if (atomic_add_return(1, &blk_probes_ref) == 1) - blk_register_tracepoints(); - ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -478,6 +475,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; err: if (bt) { From 15152e448b693fa41de40f1e40ffbe717a3aab88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:49:08 +0800 Subject: [PATCH 05/60] blktrace: report EBUSY correctly blk_trace_remove_queue() returns EINVAL if q->blk_trace == NULL, but blk_trace_setup_queue() doesn't return EBUSY if q->blk_trace != NULL. # echo 0 > sdaX/trace/enable # echo 0 > sdaX/trace/enable bash: echo: write error: Invalid argument # echo 1 > sdaX/trace/enable # echo 1 > sdaX/trace/enable (should return EBUSY) Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F614.2010101@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11e7c8d9d222..14986afdbc1c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1260,12 +1260,10 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; - int ret; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; bt->dev = dev; bt->act_mask = (u16)-1; @@ -1276,11 +1274,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); kfree(bt); - ret = -EBUSY; + return -EBUSY; } + return 0; -err: - return ret; } /* From cd649b8bb830d65c57c3c8b98d57b5402256d8bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 11:33:55 +0800 Subject: [PATCH 06/60] blktrace: remove sysfs_blk_trace_enable_show/store() sysfs_blk_trace_enable_show()/store() share most of code with sysfs_blk_trace_attr_show()/store(). Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30EA3.1060004@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 92 ++++++++++------------------------------- 1 file changed, 22 insertions(+), 70 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 14986afdbc1c..dfee6f915179 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1284,72 +1284,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) * sysfs interface to enable and configure tracing */ -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -1361,8 +1295,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); @@ -1447,6 +1380,12 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) @@ -1457,6 +1396,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); @@ -1499,6 +1440,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + ret = 0; if (q->blk_trace == NULL) ret = blk_trace_setup_queue(q, bdev->bd_dev); @@ -1512,13 +1462,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, q->blk_trace->start_lba = value; else if (attr == &dev_attr_end_lba) q->blk_trace->end_lba = value; - ret = count; } + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); out_unlock_kernel: unlock_kernel(); out: - return ret; + return ret ? ret : count; } + From b125130b22d67f249beba10b71a254558b5279d0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 10:34:00 +0800 Subject: [PATCH 07/60] blktrace: avoid accessing NULL bdev->bd_disk bdev->bd_disk can be NULL, if the block device is not opened. Try this against an unmounted partition, and you'll see NULL dereference: # echo 1 > /sys/block/sda/sda5/enable Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30098.6080107@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dfee6f915179..108f4f7715a5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1362,6 +1362,14 @@ static int blk_str2act_mask(const char *str) return mask; } +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1376,9 +1384,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; + mutex_lock(&bdev->bd_mutex); if (attr == &dev_attr_enable) { @@ -1435,7 +1444,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; From cf027f645e6aee4f0ca6197a6b6a57f327fdb13f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:39 -0500 Subject: [PATCH 08/60] tracing: add run-time field descriptions for event filtering This patch makes the field descriptions defined for event tracing available at run-time, for the event-filtering mechanism introduced in a subsequent patch. The common event fields are prepended with 'common_' in the format display, allowing them to be distinguished from the other fields that might internally have same name and can therefore be unambiguously used in filters. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 30 +++++++++++++------ kernel/trace/trace_events.c | 40 ++++++++++++++++++++++++- kernel/trace/trace_events_stage_2.h | 45 +++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 2 ++ 4 files changed, 107 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..9288dc7ad14f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -775,16 +775,26 @@ enum { TRACE_EVENT_TYPE_RAW = 2, }; +struct ftrace_event_field { + struct list_head link; + char *name; + char *type; + int offset; + int size; +}; + struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -793,6 +803,8 @@ struct ftrace_event_call { #endif }; +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3047b56f6637..961b057da28b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,34 @@ static DEFINE_MUTEX(event_mutex); +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size) +{ + struct ftrace_event_field *field; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + if (!field) + goto err; + field->name = kstrdup(name, GFP_KERNEL); + if (!field->name) + goto err; + field->type = kstrdup(type, GFP_KERNEL); + if (!field->type) + goto err; + field->offset = offset; + field->size = size; + list_add(&field->link, &call->fields); + + return 0; +err: + if (field) { + kfree(field->name); + kfree(field->type); + } + kfree(field); + return -ENOMEM; +} + static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; @@ -343,7 +371,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) + #type, "common_" #name, offsetof(typeof(field), name), \ + sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -581,6 +610,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) call->name); } + if (call->define_fields) { + ret = call->define_fields(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 5117c43f5c67..30743f7d4110 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s) \ } #include + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6b3261ca988c..468938f70141 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -252,6 +252,7 @@ static int ftrace_raw_init_event_##call(void) \ if (!id) \ return -ENODEV; \ event_##call.id = id; \ + INIT_LIST_HEAD(&event_##call.fields); \ return 0; \ } \ \ @@ -264,6 +265,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ _TRACE_PROFILE_INIT(call) \ } From f80d2d7725b04f8225b11b55e43bb2c77c819926 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:10 +0200 Subject: [PATCH 09/60] tracing, Text Edit Lock: Fix one sparse warning in kernel/extable.c Impact: cleanup. The global mutex text_mutex if declared in linux/memory.h, so this file needs to be included into kernel/extable.c, where the same mutex is defined. This fixes the following sparse warning: kernel/extable.c:32:1: warning: symbol 'text_mutex' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-3-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/extable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/extable.c b/kernel/extable.c index 25d39b0c3a1b..b54a6017b6b5 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -16,6 +16,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include +#include #include #include #include From b8b94265337f83b7db9c5f429b1769d463d7da8c Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:11 +0200 Subject: [PATCH 10/60] tracing: fix four sparse warnings Impact: cleanup. This patch fixes the following sparse warnings: kernel/trace/trace.c:385:9: warning: symbol 'trace_seq_to_buffer' was not declared. Should it be static? kernel/trace/trace_clock.c:29:13: warning: symbol 'trace_clock_local' was not declared. Should it be static? kernel/trace/trace_clock.c:54:13: warning: symbol 'trace_clock' was not declared. Should it be static? kernel/trace/trace_clock.c:74:13: warning: symbol 'trace_clock_global' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-4-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..ace685c70186 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -382,7 +382,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } -ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; void *ret; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 05b176abfd30..b588fd81f7f9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * trace_clock_local(): the simplest and least coherent tracing clock. From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:49 -0500 Subject: [PATCH 11/60] tracing: add ring_buffer_event_discard() to ring buffer This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard events from the ring buffer, for the event-filtering mechanism introduced in a subsequent patch. I did the initial version but thanks to Steven Rostedt for adding the parts that actually made it work. ;-) Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 11 +++- kernel/trace/ring_buffer.c | 117 ++++++++++++++++++++++++++++++------ 2 files changed, 105 insertions(+), 23 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 9e6052bd1a1c..e1b7b2173885 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -18,10 +18,13 @@ struct ring_buffer_event { /** * enum ring_buffer_type - internal ring buffer types * - * @RINGBUF_TYPE_PADDING: Left over page padding - * array is ignored - * size is variable depending on how much + * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event + * If time_delta is 0: + * array is ignored + * size is variable depending on how much * padding is needed + * If time_delta is non zero: + * everything else same as RINGBUF_TYPE_DATA * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +void ring_buffer_event_discard(struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..a09027ec1714 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -189,16 +189,65 @@ enum { RB_LEN_TIME_STAMP = 16, }; +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} + +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + /* inline for ring buffer fast paths */ static unsigned rb_event_length(struct ring_buffer_event *event) { - unsigned length; - switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. + */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); From 7ce7e4249921d5073e764f7ff7ad83cfa9894bd7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:04 -0500 Subject: [PATCH 12/60] tracing: add per-event filtering This patch adds per-event filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each event directory. This file can be written to to set filters; reading from it will display the current set of filters set for that event. Basically, any field listed in the 'format' file for an event can be filtered on (including strings, but not yet other array types) using either matching ('==') or non-matching ('!=') 'predicates'. A 'predicate' can be either a single expression: # echo pid != 0 > filter # cat filter pid != 0 or a compound expression of up to 8 sub-expressions combined using '&&' or '||': # echo comm == Xorg > filter # echo "&& sig != 29" > filter # cat filter comm == Xorg && sig != 29 Only events having field values matching an expression will be available in the trace output; non-matching events are discarded. Note that a compound expression is built up by echoing each sub-expression separately - it's not the most efficient way to do things, but it keeps the parser simple and assumes that compound expressions will be relatively uncommon. In any case, a subsequent patch introducing a way to set filters for entire subsystems should mitigate any need to do this for lots of events. Setting a filter without an '&&' or '||' clears the previous filter completely and sets the filter to the new expression: # cat filter comm == Xorg && sig != 29 # echo comm != Xorg # cat filter comm != Xorg To clear a filter, echo 0 to the filter file: # echo 0 > filter # cat filter none The limit of 8 predicates for a compound expression is arbitrary - for efficiency, it's implemented as an array of pointers to predicates, and 8 seemed more than enough for any filter... Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710665.7703.48.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 28 +++ kernel/trace/trace_events.c | 77 +++++++ kernel/trace/trace_events_filter.c | 326 ++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 4 + 5 files changed, 436 insertions(+) create mode 100644 kernel/trace/trace_events_filter.c diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0e45c206c2f9..2630f5121ec1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9288dc7ad14f..d9eb39e4bb38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -795,6 +795,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -803,8 +804,35 @@ struct ftrace_event_call { #endif }; +#define MAX_FILTER_PRED 8 + +struct filter_pred; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + char *str_val; + int str_len; + char *field_name; + int offset; + int not; + int or; + int compound; + int clear; +}; + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern void filter_free_pred(struct filter_pred *pred); +extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern int filter_parse(char **pbuf, struct filter_pred *pred); +extern int filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred); +extern void filter_free_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 961b057da28b..97470c48956e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -459,6 +459,71 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(call->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_preds(call); + return cnt; + } + + if (filter_add_pred(call, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -504,6 +569,12 @@ static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, }; +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -619,6 +690,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } + entry = debugfs_create_file("filter", 0444, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 000000000000..8e8c5fa25be9 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,326 @@ +/* + * trace_events_filter - generic event filtering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2009 Tom Zanussi + */ + +#include +#include +#include +#include + +#include "trace.h" + +static int filter_pred_64(struct filter_pred *pred, void *event) +{ + u64 *addr = (u64 *)(event + pred->offset); + u64 val = (u64)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_32(struct filter_pred *pred, void *event) +{ + u32 *addr = (u32 *)(event + pred->offset); + u32 val = (u32)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_16(struct filter_pred *pred, void *event) +{ + u16 *addr = (u16 *)(event + pred->offset); + u16 val = (u16)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_8(struct filter_pred *pred, void *event) +{ + u8 *addr = (u8 *)(event + pred->offset); + u8 val = (u8)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) +{ + int i, matched, and_failed = 0; + struct filter_pred *pred; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; + } else + break; + } + + if (and_failed) + return 0; + + return 1; +} + +int filter_print_preds(struct filter_pred **preds, char *buf) +{ + ssize_t this_len = 0; + char *field_name; + struct filter_pred *pred; + int i; + + if (!preds) { + this_len += sprintf(buf + this_len, "none\n"); + return this_len; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (preds[i]) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + this_len += sprintf(buf + this_len, + pred->or ? "|| " : "&& "); + this_len += sprintf(buf + this_len, + "%s ", field_name); + this_len += sprintf(buf + this_len, + pred->not ? "!= " : "== "); + if (pred->str_val) + this_len += sprintf(buf + this_len, + "%s\n", pred->str_val); + else + this_len += sprintf(buf + this_len, + "%llu\n", pred->val); + } else + break; + } + + return this_len; +} + +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *entry, *tmp; + + list_for_each_safe(entry, tmp, &call->fields) { + field = list_entry(entry, struct ftrace_event_field, link); + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +void filter_free_pred(struct filter_pred *pred) +{ + if (!pred) + return; + + kfree(pred->field_name); + kfree(pred->str_val); + kfree(pred); +} + +void filter_free_preds(struct ftrace_event_call *call) +{ + int i; + + if (call->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(call->preds[i]); + kfree(call->preds); + call->preds = NULL; + } +} + +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) +{ + int i; + + if (call->preds && !pred->compound) + filter_free_preds(call); + + if (!call->preds) { + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!call->preds[i]) { + call->preds[i] = pred; + return 0; + } + } + + return -ENOMEM; +} + +static int is_string_field(const char *type) +{ + if (strchr(type, '[') && strstr(type, "char")) + return 1; + + return 0; +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + struct ftrace_event_field *field; + + field = find_event_field(call, pred->field_name); + if (!field) + return -EINVAL; + + pred->offset = field->offset; + + if (is_string_field(field->type)) { + pred->fn = filter_pred_string; + pred->str_len = field->size; + return __filter_add_pred(call, pred); + } + + switch (field->size) { + case 8: + pred->fn = filter_pred_64; + break; + case 4: + pred->fn = filter_pred_32; + break; + case 2: + pred->fn = filter_pred_16; + break; + case 1: + pred->fn = filter_pred_8; + break; + default: + return -EINVAL; + } + + return __filter_add_pred(call, pred); +} + +int filter_parse(char **pbuf, struct filter_pred *pred) +{ + char *tmp, *tok, *val_str = NULL; + int tok_n = 0; + + /* field ==/!= number, or/and field ==/!= number, number */ + while ((tok = strsep(pbuf, " \n"))) { + if (tok_n == 0) { + if (!strcmp(tok, "0")) { + pred->clear = 1; + return 0; + } else if (!strcmp(tok, "&&")) { + pred->or = 0; + pred->compound = 1; + } else if (!strcmp(tok, "||")) { + pred->or = 1; + pred->compound = 1; + } else + pred->field_name = tok; + tok_n = 1; + continue; + } + if (tok_n == 1) { + if (!pred->field_name) + pred->field_name = tok; + else if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + tok_n = 2; + continue; + } + if (tok_n == 2) { + if (pred->compound) { + if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + } else { + val_str = tok; + break; /* done */ + } + tok_n = 3; + continue; + } + if (tok_n == 3) { + val_str = tok; + break; /* done */ + } + } + + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!pred->field_name) + return -ENOMEM; + + pred->val = simple_strtoull(val_str, &tmp, 10); + if (tmp == val_str) { + pred->str_val = kstrdup(val_str, GFP_KERNEL); + if (!pred->str_val) + return -ENOMEM; + } + + return 0; +} + + diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 468938f70141..ebf215e87d5e 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -222,6 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + \ + if (call->preds && !filter_match_preds(call, entry)) \ + ring_buffer_event_discard(event); \ } \ \ static int ftrace_raw_reg_event_##call(void) \ From cfb180f3e71b2a280a254c8646a9ab1beab63f84 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:17 -0500 Subject: [PATCH 13/60] tracing: add per-subsystem filtering This patch adds per-subsystem filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each subsystem directory. This file can be written to to set filters; reading from it will display the current set of filters set for that subsystem. Basically what it does is propagate the filter down to each event contained in the subsystem. If a particular event doesn't have a field with the name specified in the filter, it simply doesn't get set for that event. You can verify whether or not the filter was set for a particular event by looking at the filter file for that event. As with per-event filters, compound expressions are supported, echoing '0' to the subsystem's filter file clears all filters in the subsystem, etc. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710677.7703.49.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 15 ++++++ kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++--- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d9eb39e4bb38..f267723c3c52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,18 @@ struct ftrace_event_call { #endif }; +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; + struct filter_pred **preds; +}; + +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + #define MAX_FILTER_PRED 8 struct filter_pred; @@ -832,6 +844,9 @@ extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_free_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97470c48956e..97d4daaddd9a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -524,6 +524,71 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(system->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_subsystem_preds(system); + return cnt; + } + + if (filter_add_subsystem_pred(system, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -575,6 +640,12 @@ static const struct file_operations ftrace_event_filter_fops = { .write = event_filter_write, }; +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = tracing_open_generic, + .read = subsystem_filter_read, + .write = subsystem_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -595,18 +666,13 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -struct event_subsystem { - struct list_head list; - const char *name; - struct dentry *entry; -}; - static LIST_HEAD(event_subsystems); static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -633,6 +699,14 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->name = name; list_add(&system->list, &event_subsystems); + system->preds = NULL; + + entry = debugfs_create_file("filter", 0444, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8e8c5fa25be9..1ab20cee0e4c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -181,6 +181,27 @@ void filter_free_preds(struct ftrace_event_call *call) } } +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call = __start_ftrace_events; + int i; + + if (system->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(system->preds[i]); + kfree(system->preds); + system->preds = NULL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) + filter_free_preds(call); + } +} + static int __filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { @@ -250,6 +271,65 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return __filter_add_pred(call, pred); } +static struct filter_pred *copy_pred(struct filter_pred *pred) +{ + struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); + if (!new_pred) + return NULL; + + memcpy(new_pred, pred, sizeof(*pred)); + if (pred->str_val) { + new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->str_val) { + kfree(new_pred); + return NULL; + } + } + + return new_pred; +} + +int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred) +{ + struct ftrace_event_call *call = __start_ftrace_events; + struct filter_pred *event_pred; + int i; + + if (system->preds && !pred->compound) + filter_free_subsystem_preds(system); + + if (!system->preds) { + system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!system->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!system->preds[i]) { + system->preds[i] = pred; + break; + } + if (i == MAX_FILTER_PRED - 1) + return -EINVAL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) { + event_pred = copy_pred(pred); + if (event_pred) + filter_add_pred(call, event_pred); + } + } + + return 0; +} + int filter_parse(char **pbuf, struct filter_pred *pred) { char *tmp, *tok, *val_str = NULL; From fe9f57f250ab4d781b99504caeb218ca2db14c1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Mar 2009 18:41:59 +0100 Subject: [PATCH 14/60] tracing: add run-time field descriptions for event filtering, kfree fix Impact: fix potential kfree of random data in (rare) failure path Zero-initialize the field structure. Reported-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97d4daaddd9a..594d78aaa185 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,26 +24,31 @@ int trace_define_field(struct ftrace_event_call *call, char *type, { struct ftrace_event_field *field; - field = kmalloc(sizeof(*field), GFP_KERNEL); + field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) goto err; + field->name = kstrdup(name, GFP_KERNEL); if (!field->name) goto err; + field->type = kstrdup(type, GFP_KERNEL); if (!field->type) goto err; + field->offset = offset; field->size = size; list_add(&field->link, &call->fields); return 0; + err: if (field) { kfree(field->name); kfree(field->type); } kfree(field); + return -ENOMEM; } From 9bd7d099ab3f10dd666da399c064999bae427cd9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:43 +0100 Subject: [PATCH 15/60] tracing/events: make the filter files writable We need the filter files to be writable, the current filter file permissions are only set readable. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <1237759847-21025-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 594d78aaa185..19f61dd23219 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -706,7 +706,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0444, system->entry, system, + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); if (!entry) pr_warning("Could not create debugfs " @@ -769,7 +769,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - entry = debugfs_create_file("filter", 0444, call->dir, call, + entry = debugfs_create_file("filter", 0644, call->dir, call, &ftrace_event_filter_fops); if (!entry) pr_warning("Could not create debugfs " From 07edf7121374609709ef1b0889f6e7b8d6a62ec1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:46 +0100 Subject: [PATCH 16/60] tracing/events: don't use wake up for events Impact: fix hard-lockup with sched switch events Some ftrace events, such as sched wakeup, can be traced while the runqueue lock is hold. Since they are using trace_current_buffer_unlock_commit(), they call wake_up() which can try to grab the runqueue lock too, resulting in a deadlock. Now for all event, we call a new helper: trace_nowake_buffer_unlock_commit() which do pretty the same than trace_current_buffer_unlock_commit() except than it doesn't call trace_wake_up(). Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 +++++++++++++++++++++----- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_stage_3.h | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..6bad12819eb6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -860,15 +860,25 @@ static void ftrace_trace_stack(struct trace_array *tr, static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static inline void __trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); - trace_wake_up(); + + if (wake) + trace_wake_up(); +} + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(tr, event, flags, pc, 1); } struct ring_buffer_event * @@ -882,7 +892,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return trace_buffer_unlock_commit(&global_trace, event, flags, pc); + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); +} + +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f267723c3c52..54fd9bcd0a65 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -483,6 +483,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ebf215e87d5e..9a3bd49b52e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,7 +222,7 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ From 7e6ea92df3fd7cbe74e7985c6f3e40255c44b201 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:47 +0100 Subject: [PATCH 17/60] tracing/ftrace: make nop-tracer use polling wait for events on pipe Impact: display events when they arrive Now that the events don't use wake_up() anymore, we need the nop tracer to poll waiting for events on the pipe. Especially because nop is useful to look at orphan traces types (traces types that don't rely on specific tracers) because it doesn't produce traces itself. And unlike other tracers that trigger specific traces periodically, nop triggers no traces by itself that can wake him. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_nop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 9aa84bde23cd..394f94417e2f 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif From b118415bfad6d75792a85ac999e25149db8e6919 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 00:18:39 +0100 Subject: [PATCH 18/60] tracing/events: don't discard an event after commit When we want to filter an event, the filter test is done after the event is commited to the ring-buffer to be discarded later if needed. But a reader could be reading this event while we are trying to discard it. Other kind of racy events can even happen because the event is commited and can be read and/or consumed. What we want is to discard the event before committing it. Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237763919-21505-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_3.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9a3bd49b52e5..9d2fa78cecca 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,10 +222,11 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ + \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + \ } \ \ static int ftrace_raw_reg_event_##call(void) \ From 75c8b417526529d0a7072e4d93ec99dbd483a6f4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:28 -0500 Subject: [PATCH 19/60] tracing/filters: use list_for_each_entry_safe Impact: cleanup Use list_for_each_entry_safe instead of list_for_each_entry in find_event_field(). Reported-by: Frederic Weisbecker Signed-off-by: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <1237796788.7527.35.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1ab20cee0e4c..c4a413b70f78 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,11 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field; - struct list_head *entry, *tmp; + struct ftrace_event_field *field, *next; - list_for_each_safe(entry, tmp, &call->fields) { - field = list_entry(entry, struct ftrace_event_field, link); + list_for_each_entry_safe(field, next, &call->fields, link) { if (!strcmp(field->name, name)) return field; } From ee6cdabc820a29bd607f38d9cb335c3ceddc673b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:42 -0500 Subject: [PATCH 20/60] tracing/filters: fix bug in copy_pred() Impact: fix potential crash on subsystem filter expression freeing When making a copy of the predicate, pred->field_name needs to be duplicated in the copy as well, otherwise bad things can happen due to later multiple frees of the same string. This affects only per-subsystem event filtering. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796802.7527.39.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c4a413b70f78..fd01d8022ad1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -276,11 +276,19 @@ static struct filter_pred *copy_pred(struct filter_pred *pred) return NULL; memcpy(new_pred, pred, sizeof(*pred)); + + if (pred->field_name) { + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->field_name) { + kfree(new_pred); + return NULL; + } + } + if (pred->str_val) { new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!new_pred->str_val) { - kfree(new_pred); + filter_free_pred(new_pred); return NULL; } } From c4cff064be678f1e8344d907499f2a81282edc19 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:48 -0500 Subject: [PATCH 21/60] tracing/filters: clean up filter_add_subsystem_pred() Impact: cleanup, memory leak fix This patch cleans up filter_add_subsystem_pred(): - searches for the field before creating a copy of the pred - fixes memory leak in the case a predicate isn't applied - if -ENOMEM, makes sure there's no longer a reference to the pred so the caller can free the half-finished filter - changes the confusing i == MAX_FILTER_PRED - 1 comparison previously remarked upon This affects only per-subsystem event filtering. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796808.7527.40.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 31 +++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 19f61dd23219..fdab30d6c835 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -585,6 +585,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (filter_add_subsystem_pred(system, pred)) { + filter_free_subsystem_preds(system); filter_free_pred(pred); return -EINVAL; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index fd01d8022ad1..4117c2ebc245 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -318,22 +318,39 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[i] = pred; break; } - if (i == MAX_FILTER_PRED - 1) - return -EINVAL; } + if (i == MAX_FILTER_PRED) + return -EINVAL; + events_for_each(call) { + int err; + if (!call->name || !call->regfunc) continue; - if (!strcmp(call->system, system->name)) { - event_pred = copy_pred(pred); - if (event_pred) - filter_add_pred(call, event_pred); - } + if (strcmp(call->system, system->name)) + continue; + + if (!find_event_field(call, pred->field_name)) + continue; + + event_pred = copy_pred(pred); + if (!event_pred) + goto oom; + + err = filter_add_pred(call, event_pred); + if (err) + filter_free_pred(event_pred); + if (err == -ENOMEM) + goto oom; } return 0; + +oom: + system->preds[i] = NULL; + return -ENOMEM; } int filter_parse(char **pbuf, struct filter_pred *pred) From c0f92ba99bdeaf35f9c580291b4e1a657c67fbd4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:44 +0100 Subject: [PATCH 22/60] debugfs: function to know if debugfs is initialized Impact: add new debugfs API With ftrace, some tracers are registered in early initcalls and attempt to create files on the debugfs filesystem. Depending on when they are activated, they can try to create their file at any time. Some checks can be done on the tracing area but providing a helper to know if debugfs is registered make it really more easy. Signed-off-by: Frederic Weisbecker Acked-by: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/debugfs/inode.c | 16 ++++++++++++++++ include/linux/debugfs.h | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index af0e01d4c663..eb5c2ba2f81a 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -71,6 +71,9 @@ struct dentry *debugfs_create_bool(const char *name, mode_t mode, struct dentry *debugfs_create_blob(const char *name, mode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob); + +bool debugfs_initialized(void); + #else #include @@ -183,6 +186,11 @@ static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } +static inline bool debugfs_initialized(void) +{ + return false; +} + #endif #endif From 3e1f60b80cafcb5d7e8d3665b35962fbb8fb9efa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:45 +0100 Subject: [PATCH 23/60] tracing/ftrace: check if debugfs is registered before creating files Impact: fix a crash with ftrace={nop,boot} parameter If the nop or initcall tracers are launched as boot tracers, they will attempt to create their option directory and files. But these tracers are registered very early and then assigned as "boot tracers" very early if asked to. Since they do this before debugfs has been registered (core initcall), a crash is triggered. Another early tracers could also come later. So we fix it by checking if debugfs is initialized before creating the root tracing directory. Signed-off-by: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ace685c70186..f0e1337b1ebd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3513,6 +3513,9 @@ struct dentry *tracing_init_dentry(void) if (d_tracer) return d_tracer; + if (!debugfs_initialized()) + return NULL; + d_tracer = debugfs_create_dir("tracing", NULL); if (!d_tracer && !once) { From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 24 Mar 2009 01:07:24 +0300 Subject: [PATCH 24/60] tracing: Fix TRACING_SUPPORT dependency for PPC32 commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"), despite the "clean up" in its purpose, introduced a behavioural change for Kconfig symbols: we no longer able to select tracing support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented). The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core has a special case for platforms w/o irqflags (which, by the way, has become useless as of the commit above). Though according to Ingo Molnar, there was periodic build failures on weird, unmaintained architectures that had no irqflags-tracing support and hence didn't know the raw_irqs_save/restore primitives. Thus we'd better not enable irqflags-less tracing for all architectures. This patch restores the old behaviour for PPC32, and thus brings the tracing back. Other architectures can either add themselves to the exception list or (better) implement TRACE_IRQFLAGS_SUPPORT. Signed-off-by: Anton Vorontsov Acked-b: Steven Rostedt Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b0a46f889659..8a4d72931042 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -63,7 +63,11 @@ config TRACING # config TRACING_SUPPORT bool - depends on TRACE_IRQFLAGS_SUPPORT + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. + depends on TRACE_IRQFLAGS_SUPPORT || PPC32 depends on STACKTRACE_SUPPORT default y From 1618536961d31f9b3f55767b22d4a897f4204c26 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 22:17:01 +0100 Subject: [PATCH 25/60] tracing/function-graph-tracer: fix functions call traces imbalance Impact: fix traces output Sometimes one can observe an imbalance in the traces between function calls and function return traces: func1() { } } The curly brace inside func1() is the return of another function nested inside func1. The return trace have been inserted in the buffer but not the entry. We are storing a return address on the function traces stack while we haven't inserted its entry on the buffer, hence the imbalance on the traces. This is because the tracers doesn't check all failures that can happen on buffer insertion. This patch reports the tracing recursion failures and the ring buffer failures. In such cases, we now restore the original return address for the function, giving up its return trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237843021-11695-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6bad12819eb6..89f0c2544ad0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -924,7 +924,7 @@ trace_function(struct trace_array *tr, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_graph_entry(struct trace_array *tr, +static int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -933,15 +933,17 @@ static void __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; + return 0; event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); + + return 1; } static void __trace_graph_return(struct trace_array *tr, @@ -1162,6 +1164,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array_cpu *data; unsigned long flags; long disabled; + int ret; int cpu; int pc; @@ -1177,15 +1180,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) set_tsk_trace_graph(current); + atomic_dec(&data->disabled); local_irq_restore(flags); - return 1; + return ret; } void trace_graph_return(struct ftrace_graph_ret *trace) From 1fc2d5c11918082536acf261ce6abb1f5511053f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:01 -0500 Subject: [PATCH 26/60] tracing/filters: use list_for_each_entry Impact: cleanup No need to use the safe version here, so use list_for_each_entry instead of list_for_each_entry_safe in find_event_field(). Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878841.8339.57.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4117c2ebc245..3f0b79f8a4bc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,9 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; - list_for_each_entry_safe(field, next, &call->fields, link) { + list_for_each_entry(field, &call->fields, link) { if (!strcmp(field->name, name)) return field; } From 09f1f245c79585383de63e3ca54d0f91824bff3a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:11 -0500 Subject: [PATCH 27/60] tracing/filters: free pred when clearing filters Impact: fix (small) per trace filter modification memory leak Free the current pred when clearing the filters via the filter files. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878851.8339.58.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fdab30d6c835..a9381384aa9e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -516,6 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_preds(call); + filter_free_pred(pred); return cnt; } @@ -581,6 +582,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_subsystem_preds(system); + filter_free_pred(pred); return cnt; } From 4bda2d517bfa3ce3d7044e06988cdddae7adffe2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:31 -0500 Subject: [PATCH 28/60] tracing/filters: use trace_seq_printf() to print filters Impact: cleanup Instead of just using the trace_seq buffer to print the filters, use trace_seq_printf() as it was intended to be used. Reported-by: Steven Rostedt Signed-off-by: Tom Zanussi Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878871.8339.59.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events.c | 8 ++++---- kernel/trace/trace_events_filter.c | 25 +++++++++---------------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54fd9bcd0a65..90a848debcba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -840,7 +840,8 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); extern void filter_free_pred(struct filter_pred *pred); -extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern void filter_print_preds(struct filter_pred **preds, + struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a9381384aa9e..d132997ab756 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,8 +481,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(call->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(call->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -547,8 +547,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(system->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(system->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3f0b79f8a4bc..9fca8bb1c06b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -24,6 +24,7 @@ #include #include "trace.h" +#include "trace_output.h" static int filter_pred_64(struct filter_pred *pred, void *event) { @@ -108,16 +109,15 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -int filter_print_preds(struct filter_pred **preds, char *buf) +void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) { - ssize_t this_len = 0; char *field_name; struct filter_pred *pred; int i; if (!preds) { - this_len += sprintf(buf + this_len, "none\n"); - return this_len; + trace_seq_printf(s, "none\n"); + return; } for (i = 0; i < MAX_FILTER_PRED; i++) { @@ -125,23 +125,16 @@ int filter_print_preds(struct filter_pred **preds, char *buf) pred = preds[i]; field_name = pred->field_name; if (i) - this_len += sprintf(buf + this_len, - pred->or ? "|| " : "&& "); - this_len += sprintf(buf + this_len, - "%s ", field_name); - this_len += sprintf(buf + this_len, - pred->not ? "!= " : "== "); + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); if (pred->str_val) - this_len += sprintf(buf + this_len, - "%s\n", pred->str_val); + trace_seq_printf(s, "%s\n", pred->str_val); else - this_len += sprintf(buf + this_len, - "%llu\n", pred->val); + trace_seq_printf(s, "%llu\n", pred->val); } else break; } - - return this_len; } static struct ftrace_event_field * From 9f58a159d022c8f2533a27708aa267adf4f0e3ce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:42 -0500 Subject: [PATCH 29/60] tracing/filters: disallow integer values for string filters and vice versa Impact: fix filter use boundary condition / crash Make sure filters for string fields don't use integer values and vice versa. Getting it wrong can crash the system or produce bogus results. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9fca8bb1c06b..026be412f356 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -237,9 +237,14 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) pred->offset = field->offset; if (is_string_field(field->type)) { + if (!pred->str_val) + return -EINVAL; pred->fn = filter_pred_string; pred->str_len = field->size; return __filter_add_pred(call, pred); + } else { + if (pred->str_val) + return -EINVAL; } switch (field->size) { From e4955c9986a27bb47ddeb6cd55803053f547e2e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:04:37 +0800 Subject: [PATCH 30/60] blktrace: mark ddir_act[] const Impact: cleanup ddir_act and what2act always stay immutable. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89415.5080503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 108f4f7715a5..1ffcbd4ac45b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,8 +147,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ @@ -1116,10 +1116,10 @@ static void blk_tracer_reset(struct trace_array *tr) blk_tracer_stop(tr); } -static struct { +static const struct { const char *act[2]; int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { +} what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, From 65796348e09880e12b97267d39b8857c758440a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:06 +0800 Subject: [PATCH 31/60] blktrace: fix wrong calculation of RWBS Impact: fix the output of IO type category characters Trace categories are the upper 16 bits, not the lower 16 bits. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89432.8010805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1ffcbd4ac45b..9af41430ee54 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -922,23 +922,24 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; + int tc = t->action >> BLK_TC_SHIFT; - if (t->action & BLK_TC_DISCARD) + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) + if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) + if (tc & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) + if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) + if (tc & BLK_TC_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; From e0dc81bec0927fa0c8aabc521793161909eef7a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:51 +0800 Subject: [PATCH 32/60] blktrace: fix t_error() Impact: fix error flag output t_error() should return t->error but not t->sector. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8945F.5020802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9af41430ee54..f69f8bd8bef9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -968,7 +968,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent) static inline __u16 t_error(const struct trace_entry *ent) { - return te_blk_io_trace(ent)->sector; + return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent) From 093419971e03362a00f499960557c119982ea09f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 17:43:30 +0800 Subject: [PATCH 33/60] blktrace: print human-readable act_mask Impact: new feature, allow symbolic values in /debug/tracing/act_mask Print stringified act_mask instead of hex value: # cat act_mask read,write,barrier,sync,queue,requeue,issue,complete,fs,pc,ahead,meta, discard,drv_data # echo "meta,write" > act_mask # cat act_mask write,meta Also: - make act_mask accept "ahead", "meta", "discard" and "drv_data" - use strsep() instead of strchr() to parse user input - return -EINVAL if a token is not found in the mask map - fix a bug that 'value' is unsigned, so it can < 0 - propagate error value of blk_trace_mask2str() to userspace, but not always return -ENXIO. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8AB42.1000802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 103 +++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f69f8bd8bef9..6fb274f5f34e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1316,53 +1316,77 @@ struct attribute_group blk_trace_attr_group = { .attrs = blk_trace_attrs, }; -static int blk_str2act_mask(const char *str) +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, +}; + +static int blk_trace_str2mask(const char *str) { + int i; int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; + char *s, *token; - if (copy == NULL) + s = kstrdup(str, GFP_KERNEL); + if (s == NULL) return -ENOMEM; - - s = strstrip(copy); + s = strstrip(s); while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) + token = strsep(&s, ","); + if (token == NULL) break; - s = sep + 1; + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } } - kfree(copy); + kfree(s); return mask; } +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? "" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + static struct request_queue *blk_trace_get_queue(struct block_device *bdev) { if (bdev->bd_disk == NULL) @@ -1399,7 +1423,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", q->blk_trace->pid); else if (attr == &dev_attr_start_lba) @@ -1424,7 +1448,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct request_queue *q; struct hd_struct *p; u64 value; - ssize_t ret = -ENXIO; + ssize_t ret = -EINVAL; if (count == 0) goto out; @@ -1432,13 +1456,16 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_act_mask) { if (sscanf(buf, "%llx", &value) != 1) { /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) + ret = blk_trace_str2mask(buf); + if (ret < 0) goto out; + value = ret; } } else if (sscanf(buf, "%llu", &value) != 1) goto out; + ret = -ENXIO; + lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); From 098335215a4921a8a54193829eaed602dca24df5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 21 Mar 2009 02:44:50 -0400 Subject: [PATCH 34/60] tracing: fix memory leak in trace_stat If the function profiler does not have any items recorded and one were to cat the function stat file, the kernel would take a BUG with a NULL pointer dereference. Looking further into this, I found that returning NULL from stat_start did not stop the stat logic, and would later call stat_next. This breaks from the way seq_file works, so I looked into fixing the stat code. This is where I noticed that the last next_entry is never freed. It is allocated, and if the stat_next returns NULL, the code breaks out of the loop, unlocks the mutex and exits. We never link the next_entry nor do we free it. Thus it is a real memory leak. This patch rearranges the code a bit to not only fix the memory leak, but also to act more like seq_file where nothing is printed if there is nothing to print. That is, stat_start returns NULL. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 39310e3434ee..f71b85b22cfe 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; - void *prev_stat; + void *stat; int ret = 0; int i; @@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; + stat = ts->stat_start(); + if (!stat) + goto exit; + /* * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. @@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session) list_add(&new_entry->list, &session->stat_list); - new_entry->stat = ts->stat_start(); - prev_stat = new_entry->stat; + new_entry->stat = stat; /* * Iterate over the tracer stat entries and store them in a sorted * list. */ for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; @@ -114,11 +123,7 @@ static int stat_seq_init(struct tracer_stat_session *session) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = ts->stat_next(prev_stat, i); - - /* End of insertion */ - if (!new_entry->stat) - break; + new_entry->stat = stat; list_for_each_entry(iter_entry, &session->stat_list, list) { @@ -137,8 +142,6 @@ static int stat_seq_init(struct tracer_stat_session *session) break; } } - - prev_stat = new_entry->stat; } exit: mutex_unlock(&session->stat_mutex); From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: [PATCH 35/60] function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; From 05ce5818adee8f8efd0a5ca0d900a6789012516b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 00:18:31 -0400 Subject: [PATCH 36/60] function-graph: prevent more than one tracer registering Impact: prevent crash due to multiple function graph tracers The function graph tracer can currently only handle a single tracer being registered. If another tracer registers with the function graph tracer it can crash the system. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7847806eefef..c81a759fbf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2643,6 +2643,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); + /* we currently allow only one tracer registered at a time */ + if (atomic_read(&ftrace_graph_active)) { + ret = -EBUSY; + goto out; + } + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: [PATCH 37/60] function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } From be6f164a02f394675e2ac2077dd354cebef5b4c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 11:06:24 -0400 Subject: [PATCH 38/60] function-graph: add option for include sleep times Impact: give user a choice to show times spent while sleeping The user may want to see the time a function spent sleeping. This patch adds the trace option "sleep-time" to allow that. The "sleep-time" option is default on. echo sleep-time > /debug/tracing/trace_options produces: ------------------------------------------ 2) avahi-d-3428 => -0 ------------------------------------------ 2) | finish_task_switch() { 2) 0.621 us | _spin_unlock_irq(); 2) 2.202 us | } 2) ! 1002.197 us | } 2) ! 1003.521 us | } where as, echo nosleep-time > /debug/tracing/trace_options produces: 0) -0 => yum-upd-3416 ------------------------------------------ 0) | finish_task_switch() { 0) 0.643 us | _spin_unlock_irq(); 0) 2.342 us | } 0) + 41.302 us | } 0) + 42.453 us | } Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ kernel/trace/trace.c | 3 ++- kernel/trace/trace.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0b90364d1a2c..02d2de9d08ba 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2599,6 +2599,13 @@ ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, unsigned long long timestamp; int index; + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + timestamp = trace_clock_local(); prev->ftrace_timestamp = timestamp; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f0e1337b1ebd..67c6a21dd427 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -316,6 +316,7 @@ static const char *trace_options[] = { "context-info", "latency-format", "global-clock", + "sleep-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..d7410bbb9a80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -683,6 +683,7 @@ enum trace_iterator_flags { TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, + TRACE_ITER_SLEEP_TIME = 0x100000, }; /* From cc59c9e8d0165c632fd056c4a23e36f917507fb4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 11:03:01 +0800 Subject: [PATCH 39/60] ftrace: show virtual PID Impact: fix PID output under namespaces When current namespace is not the global namespace, pid read from set_ftrace_pid is no correct. # ~/newpid_namespace_run bash # echo $$ 1 # echo 1 > set_ftrace_pid # cat set_ftrace_pid 3756 Since we write virtual PID to set_ftrace_pid, we need get virtual PID when we read it. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C84D65.9050606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02d2de9d08ba..bb377112b1bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2264,7 +2264,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf, if (ftrace_pid_trace == ftrace_swapper_pid) r = sprintf(buf, "swapper tasks\n"); else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 13:38:06 +0800 Subject: [PATCH 40/60] tracing: use union for multi-usages field Impact: cleanup struct dyn_ftrace::ip has different usages in his lifecycle, we use union for it. And also for struct dyn_ftrace::flags. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C871BE.3080405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 +++++++++--- kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1141248c84ee..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,9 +145,15 @@ enum { }; struct dyn_ftrace { - unsigned long ip; /* address of mcount call-site */ - unsigned long flags; - struct dyn_arch_ftrace arch; + union { + unsigned long ip; /* address of mcount call-site */ + struct dyn_ftrace *freelist; + }; + union { + unsigned long flags; + struct dyn_ftrace *newlist; + }; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb377112b1bb..7b8722baf153 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->flags = (unsigned long)ftrace_new_addrs; + rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; return rec; @@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod) return -1; p = ftrace_new_addrs; - ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + ftrace_new_addrs = p->newlist; p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ From e6f489013b985b58d096a3091ece0ed579367232 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:27:17 +0800 Subject: [PATCH 41/60] trace_stat: don't call seq_printf() in seq_operation->start() Impact: Fix incorrect way using seq_file's API Use SEQ_START_TOKEN instead of calling ->stat_headers() int seq_operation->start(). Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Cc: Alexey Dobriyan LKML-Reference: <49C9EAE5.5070202@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cfe..8c129dd480ad 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -163,7 +163,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) /* If we are in the beginning of the file, print the headers */ if (!*pos && session->ts->stat_headers) - session->ts->stat_headers(s); + return SEQ_START_TOKEN; return seq_list_start(&session->stat_list, *pos); } @@ -172,6 +172,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct tracer_stat_session *session = s->private; + if (p == SEQ_START_TOKEN) + return seq_list_start(&session->stat_list, *pos); + return seq_list_next(p, &session->stat_list, pos); } @@ -186,6 +189,9 @@ static int stat_seq_show(struct seq_file *s, void *v) struct tracer_stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + return session->ts->stat_show(s, l->stat); } From 220ba351dfa57eca4bec5ce0098a276446a47958 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:58:39 +0800 Subject: [PATCH 42/60] trace_stat: keep original order Impact: make trace_stat files show items with the original order trace_stat tracer reverse the items, it makes the output looks a little ugly. Example, when we read trace_stat/workqueues, we get cpu#7's stat. at first, and then cpu#6... cpu#0. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F23F.5040307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8c129dd480ad..acdebd771a93 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session) INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry(iter_entry, &session->stat_list, list) { + list_for_each_entry_reverse(iter_entry, &session->stat_list, + list) { /* Insertion with a descendent sorting */ - if (ts->stat_cmp(new_entry->stat, - iter_entry->stat) > 0) { + if (ts->stat_cmp(iter_entry->stat, + new_entry->stat) >= 0) { - list_add_tail(&new_entry->list, - &iter_entry->list); - break; - - /* The current smaller value */ - } else if (list_is_last(&iter_entry->list, - &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } } + + /* The current larger value */ + if (list_empty(&new_entry->list)) + list_add(&new_entry->list, &session->stat_list); } exit: mutex_unlock(&session->stat_mutex); From 2f63b840bc8a816ac879ee773b035cf3e433fae4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:59:18 +0800 Subject: [PATCH 43/60] trace_workqueues: fix empty line's output Empty lines separate cpus stat. After previous fix(trace_stat: keep original order) applied, the empty lines are displayed at incorrect position. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F266.2060706@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf1..797201e4a137 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct pid *pid; struct task_struct *tsk; + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); @@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) put_pid(pid); } - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return 0; } static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n\n"); + seq_printf(s, "# | | | |\n"); return 0; } From 759ee0915dd713361e72facb78b66600b5712d65 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 17:06:30 +0800 Subject: [PATCH 44/60] init,cpuset: fix initialize order Impact: cpuset_wq should be initialized after init_workqueues() When I read /debugfs/tracing/trace_stat/workqueues, I got this: # CPU INSERTED EXECUTED NAME # | | | | 0 0 0 cpuset 0 285 285 events/0 0 2 2 work_on_cpu/0 0 1115 1115 khelper 0 325 325 kblockd/0 0 0 0 kacpid 0 0 0 kacpi_notify 0 0 0 ata/0 0 0 0 ata_aux 0 0 0 ksuspend_usbd 0 0 0 aio/0 0 0 0 nfsiod 0 0 0 kpsmoused 0 0 0 kstriped 0 0 0 kondemand/0 0 1 1 hid_compat 0 0 0 rpciod/0 1 64 64 events/1 1 2 2 work_on_cpu/1 1 5 5 kblockd/1 1 0 0 ata/1 1 0 0 aio/1 1 0 0 kondemand/1 1 0 0 rpciod/1 I found "cpuset" is at the earliest. I found a create_singlethread_workqueue() is earlier than init_workqueues(): kernel_init() ->cpuset_init_smp() ->create_singlethread_workqueue() ->do_basic_setup() ->init_workqueues() I think it's better that create_singlethread_workqueue() is called after workqueue subsystem has been initialized. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Paul Menage Cc: miaoxie Cc: Li Zefan Cc: Andrew Morton LKML-Reference: <49C9F416.1050707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- init/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 20d784ab5ef8..b0097d2b63ae 100644 --- a/init/main.c +++ b/init/main.c @@ -772,6 +772,7 @@ static void __init do_basic_setup(void) { rcu_init_sched(); /* needed by module_init stage. */ init_workqueues(); + cpuset_init_smp(); usermodehelper_init(); driver_init(); init_irq_proc(); @@ -865,8 +866,6 @@ static int __init kernel_init(void * unused) smp_init(); sched_init_smp(); - cpuset_init_smp(); - do_basic_setup(); /* From fee039a1d05c6e0f71b0fe270d847742a02d56c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 23 Mar 2009 10:14:52 -0400 Subject: [PATCH 45/60] x86: kretprobe-booster interrupt emulation code fix Fix interrupt emulation code in kretprobe-booster according to pt_regs update (es/ds change and gs adding). This issue has been reported on systemtap-bugzilla: http://sources.redhat.com/bugzilla/show_bug.cgi?id=9965 | On a -tip kernel on x86_32, kretprobe_example (from samples) triggers the | following backtrace when its retprobing a class of functions that cause a | copy_from/to_user(). | | BUG: sleeping function called from invalid context at mm/memory.c:3196 | in_atomic(): 0, irqs_disabled(): 1, pid: 2286, name: cat Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Tested-by: Bharata B Rao Cc: systemtap-ml LKML-Reference: <49C7995C.2010601@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4558dd3918cf..759095d53a06 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -638,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void) #else " pushf\n" /* - * Skip cs, ip, orig_ax. + * Skip cs, ip, orig_ax and gs. * trampoline_handler() will plug in these values */ - " subl $12, %esp\n" + " subl $16, %esp\n" " pushl %fs\n" - " pushl %ds\n" " pushl %es\n" + " pushl %ds\n" " pushl %eax\n" " pushl %ebp\n" " pushl %edi\n" @@ -655,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ - " movl %eax, 52(%esp)\n" + " movl %eax, 56(%esp)\n" " popl %ebx\n" " popl %ecx\n" " popl %edx\n" @@ -666,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* Skip ip, orig_ax, es, ds, fs */ - " addl $20, %esp\n" + /* Skip ds, es, fs, gs, orig_ax and ip */ + " addl $24, %esp\n" " popf\n" #endif " ret\n"); @@ -691,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) regs->cs = __KERNEL_CS; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; From 2a4efa42450762cbfa5c5712aa4cc9f06924c9fd Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 25 Mar 2009 12:06:05 +0800 Subject: [PATCH 46/60] ftrace: Using FTRACE_WARN_ON() to check "freed record" in ftrace_release() "Because when we call ftrace_free_rec we change the rec->ip to point to the next record in the chain. Something is very wrong if rec->ip >= s && rec->ip < e and the record is already free." "Note, use FTRACE_WARN_ON() macro. This way it shuts down ftrace if it is hit and helps to avoid further damage later." -- Steven Rostedt Signed-off-by: Zhao Lei Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf153..1752a63f37c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,14 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); ftrace_free_rec(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } From 9a8118baaeb0eaa148913bed77bf9c6335f6ca63 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Mar 2009 01:24:34 -0500 Subject: [PATCH 47/60] tracing: filter fix for TRACE_EVENT_FORMAT events Impact: fix crash (hang) when using TRACE_EVENT_FORMAT filter files filters are only hooked up to the tracepoint events defined using TRACE_EVENT but not the tracers that use TRACE_EVENT_FORMAT, such as ftrace. Do not display the filter files at all for TRACE_EVENT_FORMAT events for the time being. Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d132997ab756..64ec4d278ffb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,7 +680,6 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; - struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -709,12 +708,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0644, system->entry, system, - &ftrace_subsystem_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); - return system->entry; } @@ -770,14 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } + entry = debugfs_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; From 548c316137901cc81dea35e26362a9d5ba0b89b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 31 Mar 2009 00:25:23 +0200 Subject: [PATCH 48/60] tracing, Text Edit Lock: cleanup Remove incorrectly introduced headers. Signed-off-by: Ingo Molnar --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index dfc9e4ea4e8b..baa999e87cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -48,8 +48,6 @@ #include #include #include -#include -#include #include #include #include From 6c051ce0307526adec32a847f0daa1af2124f0a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:18:56 +0800 Subject: [PATCH 49/60] blktrace: fix timestamp in binary output I found the timestamp is wrong: # echo bin > trace_option # echo blk > current_tracer # cat trace_pipe | blkparse -i - 8,0 0 0 0.000000000 504 A W ... ... 8,7 1 0 0.008534097 0 C R ... (should be 8.534097xxx) user-space blkparse expects the timestamp to be nanosecond. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6fb274f5f34e..ee7a8bb8b1e8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1168,7 +1168,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), + .time = iter->ts, }; if (!trace_seq_putmem(s, &old, offset)) From b5230b56ee6caeb27cedb7753c0c319646383bb4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:19:33 +0800 Subject: [PATCH 50/60] blktrace: fix a race when creating blk_tree_root in debugfs t1 t2 ------ ------ do_blk_trace_setup() do_blk_trace_setup() if (!blk_tree_root) { if (!blk_tree_root) blk_tree_root = create_dir() blk_tree_root = create_dir(); (now blk_tree_root == NULL) ... dir = create_dir(name, blk_tree_root); Due to this race, t1 will create 'dir' in /debugfs but not /debugfs/block. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ee7a8bb8b1e8..95f89faca73e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -426,11 +426,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; + mutex_lock(&blk_tree_mutex); if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) + if (!blk_tree_root) { + mutex_unlock(&blk_tree_mutex); goto err; + } } + mutex_unlock(&blk_tree_mutex); dir = debugfs_create_dir(buts->name, blk_tree_root); From 5554720482a631702146a959db22fe417740e0a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:21:26 +0800 Subject: [PATCH 51/60] blktrace: fix the original blktrace Currently the original blktrace, which is using relay and is used via ioctl, is broken. You can use ftrace to see the output of blktrace, but user-space blktrace is unusable. It's broken by "blktrace: add ftrace plugin" (c71a896154119f4ca9e89d6078f5f63ad60ef199) - if (unlikely(bt->trace_state != Blktrace_running)) + if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) return; With this patch, both ioctl and ftrace can be used, but of course you can't use both of them at the same time. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95f89faca73e..a7f7ff5db38b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -110,7 +110,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; - if (blk_tr) { + if (blk_tracer_enabled) { va_start(args, fmt); ftrace_vprintk(fmt, args); va_end(args); @@ -169,7 +169,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running || + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) return; @@ -185,7 +185,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - if (blk_tr) { + if (blk_tracer_enabled) { tracing_record_cmdline(current); pc = preempt_count(); @@ -235,7 +235,7 @@ record_it: if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - if (blk_tr) { + if (blk_tracer_enabled) { trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } @@ -267,8 +267,7 @@ int blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) + if (bt->trace_state != Blktrace_running) blk_trace_cleanup(bt); return 0; @@ -1273,7 +1272,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) bt->dev = dev; bt->act_mask = (u16)-1; bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { From eb08f8eb0673d9c1e62b69ad1b41593e73c40467 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:27 +0800 Subject: [PATCH 52/60] blktrace: fix off-by-one bug 'what' is used as the index of array what2act, so it can't >= the array size. Signed-off-by: Li Zefan Acked-by: Jens Axboe Acked-by: Arnaldo Carvalho de Melo Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7f7ff5db38b..d43cdac7c754 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1152,7 +1152,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, if (!trace_print_context(iter)) return TRACE_TYPE_PARTIAL_LINE; - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); @@ -1199,7 +1199,7 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) t = (const struct blk_io_trace *)iter->ent; what = t->action & ((1 << BLK_TC_SHIFT) - 1); - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); else { const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); From 35ac51bfe4c293b67ce9f85082ba0b9bc6123c40 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:19:46 +0800 Subject: [PATCH 53/60] blktrace: make classic output more classic Impact: fix ftrace plugin timestamp output In the classic user-space blktrace, the output timestamp is sec.nsec not sec.usec. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d43cdac7c754..5b28f0f119c5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -994,8 +994,8 @@ static void get_pdu_remap(const struct trace_entry *ent, static int blk_log_action_iter(struct trace_iterator *iter, const char *act) { char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; const struct trace_entry *ent = iter->ent; const struct blk_io_trace *t = (const struct blk_io_trace *)ent; @@ -1003,9 +1003,9 @@ static int blk_log_action_iter(struct trace_iterator *iter, const char *act) fill_rwbs(rwbs, t); return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); + secs, nsec_rem, ent->pid, act, rwbs); } static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, From 17ba97e347bec9bbc47a0877c7a098708982129d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:20:09 +0800 Subject: [PATCH 54/60] blktrace: fix blk_probes_ref chaos Impact: fix mixed ioctl and ftrace-plugin blktrace use refcount bugs ioctl-based blktrace allocates bt and registers tracepoints when ioctl(BLKTRACESETUP), and do all cleanups when ioctl(BLKTRACETEARDOWN). while ftrace-based blktrace allocates/frees bt when: # echo 1/0 > /sys/block/sda/sda1/trace/enable and registers/unregisters tracepoints when: # echo blk/nop > /debugfs/tracing/current_tracer or # echo 1/0 > /debugfs/tracing/tracing_enable The separatation of allocation and registeration causes 2 problems: 1. current user-space blktrace still calls ioctl(TEARDOWN) when ioctl(SETUP) failed: # echo 1 > /sys/block/sda/sda1/trace/enable # blktrace /dev/sda BLKTRACESETUP: Device or resource busy ^C and now blk_probes_ref == -1 2. Another way to make blk_probes_ref == -1: # plugin sdb && mount sdb1 # echo 1 > /sys/block/sdb/sdb1/trace/enable # remove sdb This patch does the allocation and registeration when writing sdaX/trace/enable. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5b28f0f119c5..8d6bd12aab10 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -478,7 +478,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } - if (atomic_add_return(1, &blk_probes_ref) == 1) + if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); return 0; @@ -1091,8 +1091,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - if (atomic_add_return(1, &blk_probes_ref) == 1) - blk_register_tracepoints(); trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1107,15 +1105,10 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { trace_flags |= TRACE_ITER_CONTEXT_INFO; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); } static void blk_tracer_reset(struct trace_array *tr) { - if (!atomic_read(&blk_probes_ref)) - return; - blk_tracer_enabled = false; blk_tracer_stop(tr); } @@ -1254,6 +1247,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + kfree(bt); return 0; } @@ -1280,6 +1276,9 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) return -EBUSY; } + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; } From ad5dd5493a55e462796e42e50a49e76df76fdb05 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:20:24 +0800 Subject: [PATCH 55/60] blktrace: fix memory leak when freeing struct blk_io_trace Impact: fix mixed ioctl and ftrace-plugin blktrace use memory leak When mixing the use of ioctl-based blktrace and ftrace-based blktrace, we can leak memory in this way: # btrace /dev/sda > /dev/null & # echo 0 > /sys/block/sda/sda1/trace/enable now we leak bt->dropped_file, bt->msg_file, bt->rchan... Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8d6bd12aab10..2f21d7761600 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -247,7 +247,7 @@ record_it: static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); @@ -255,6 +255,11 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + blk_trace_free(bt); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); } @@ -410,11 +415,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (buts->name[i] == '/') buts->name[i] = '_'; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; + ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; @@ -483,17 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return 0; err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } + blk_trace_free(bt); return ret; } @@ -1091,6 +1086,7 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { + blk_tracer_enabled = true; trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1098,18 +1094,17 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - blk_tracer_enabled = true; return 0; } static void blk_tracer_stop(struct trace_array *tr) { + blk_tracer_enabled = false; trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) { - blk_tracer_enabled = false; blk_tracer_stop(tr); } @@ -1250,7 +1245,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - kfree(bt); + blk_trace_free(bt); return 0; } From b6a4b0c3ad4c09c1d37b1040ac8e3ebd1016e10b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:23 +0800 Subject: [PATCH 56/60] blktrace: extract duplidate code Impact: cleanup blk_trace_event_print() and blk_tracer_print_line() share most of the code. text data bss dec hex filename 8605 393 12 9010 2332 kernel/trace/blktrace.o.orig text data bss dec hex filename 8555 393 12 8960 2300 kernel/trace/blktrace.o This patch also prepares for the next patch, that prints out BLK_TN_MESSAGE. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 62 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2f21d7761600..c103b0c20022 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -986,29 +986,31 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector = be64_to_cpu(sector); } -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); + +static int blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[6]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); return trace_seq_printf(&iter->seq, "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, ent->pid, act, rwbs); + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) +static int blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[6]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } @@ -1129,22 +1131,25 @@ static const struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) { struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + const struct blk_io_trace *t; + u16 what; int ret; + bool long_act; + blk_log_action_t *log_action; - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; + t = te_blk_io_trace(iter->ent); + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + ret = log_action(iter, what2act[what].act[long_act]); if (ret) ret = what2act[what].print(s, iter->ent); } @@ -1152,6 +1157,15 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + return print_one_line(iter, false); +} + static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1177,26 +1191,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags) static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - const struct blk_io_trace *t; - u16 what; - int ret; - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + return print_one_line(iter, true); } static struct tracer blk_tracer __read_mostly = { From 18cea4591a98817697017bcb056a848bae1205df Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:54 +0800 Subject: [PATCH 57/60] blktrace: print out BLK_TN_MESSAGE properly Impact: improve ftrace plugin output Before this patch: # cat trace make-5383 [001] 741.240059: 8,7 P N [make] __trace_note_message: cfq1074 # echo 1 > options/blk_classic # cat trace 8,7 1 0.692221252 0 C W 130411392 + 1024 [0] Bad pc action 6361 Bad pc action 283d # echo 0 > options/blk_classic # echo bin > trace_options # cat trace_pipe | blkparse -i - (can't parse messages generated by blk_add_trace_msg()) After this patch: # cat trace -0 [001] 187.600933: 8,7 C W 145220224 + 8 [0] -0 [001] 187.600946: 8,7 m N cfq1076 complete # echo 1 > options/blk_classic # cat trace 8,7 1 0.256378996 238 I W 113190728 + 8 [pdflush] 8,7 1 0.256378998 238 m N cfq1076 insert_request # echo 0 > options/blk_classic # echo bin > trace_options # cat trace_pipe | blkparse -i - 8,7 1 0 22.973250293 0 C W 102770576 + 8 [0] 8,7 1 0 22.973259213 0 m N cfq1076 complete Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 80 +++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c103b0c20022..947c5b3f90c4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -59,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len) { struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + int pc = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + + if (blk_tracer) { + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { - const int cpu = smp_processor_id(); - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); +record_it: t->device = bt->dev; t->action = action; t->pid = pid; t->cpu = cpu; t->pdu_len = len; memcpy((void *) t + sizeof(*t), data, len); + + if (blk_tracer) + trace_buffer_unlock_commit(blk_tr, event, 0, pc); } } @@ -110,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; - if (blk_tracer_enabled) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) + if (unlikely(bt->trace_state != Blktrace_running && + !blk_tracer_enabled)) return; local_irq_save(flags); @@ -168,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, unsigned long *sequence; pid_t pid; int cpu, pc = 0; + bool blk_tracer = blk_tracer_enabled; - if (unlikely(bt->trace_state != Blktrace_running && - !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[rw & WRITE]; @@ -185,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - if (blk_tracer_enabled) { + if (blk_tracer) { tracing_record_cmdline(current); pc = preempt_count(); @@ -235,7 +246,7 @@ record_it: if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - if (blk_tracer_enabled) { + if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } @@ -922,6 +933,11 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; + if (t->action == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) @@ -939,7 +955,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) rwbs[i++] = 'S'; if (tc & BLK_TC_META) rwbs[i++] = 'M'; - +out: rwbs[i] = '\0'; } @@ -1074,6 +1090,17 @@ static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) get_pdu_int(ent), cmd); } +static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +{ + int ret; + const struct blk_io_trace *t = te_blk_io_trace(ent); + + ret = trace_seq_putmem(s, t + 1, t->pdu_len); + if (ret) + return trace_seq_putc(s, '\n'); + return ret; +} + /* * struct tracer operations */ @@ -1146,6 +1173,13 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, long_act = !!(trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + if (t->action == BLK_TN_MESSAGE) { + ret = log_action(iter, long_act ? "message" : "m"); + if (ret) + ret = blk_log_msg(s, iter->ent); + goto out; + } + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { @@ -1153,7 +1187,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, if (ret) ret = what2act[what].print(s, iter->ent); } - +out: return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -1253,11 +1287,16 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; + int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; + bt->dev = dev; bt->act_mask = (u16)-1; bt->end_lba = -1ULL; @@ -1265,14 +1304,17 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); - kfree(bt); - return -EBUSY; + ret = -EBUSY; + goto free_bt; } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + +free_bt: + blk_trace_free(bt); + return ret; } /* From bdd6df6af98ce7e70702edfb5fd5dbbd8d1b0453 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:22 +0200 Subject: [PATCH 58/60] tracing: provide trace_seq_reserve() trace_seq_reserve() allows a caller to reserve space in a trace_seq and write directly into it. This makes it easier to export binary data to userspace via the tracing interface, by simply filling in a struct. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 13 +++++++++++++ kernel/trace/trace_output.h | 1 + 2 files changed, 14 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd2455..6595074cbac5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -167,6 +167,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + void *ret; + + if (len > ((PAGE_SIZE - 1) - s->len)) + return NULL; + + ret = s->buffer + s->len; + s->len += len; + + return ret; +} + int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a9..0ae20b83eec8 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -33,6 +33,7 @@ int trace_seq_puts(struct trace_seq *s, const char *str); int trace_seq_putc(struct trace_seq *s, unsigned char c); int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +void *trace_seq_reserve(struct trace_seq *s, size_t len); int trace_seq_path(struct trace_seq *s, struct path *path); int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); From f285901bb21355bb47106658ef14eeb6b8ed538f Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:23 +0200 Subject: [PATCH 59/60] tracing: add missing 'extern' keywords to trace_output.h Impact: cleanup Many declarations within trace_output.h are missing the 'extern' keyword in an inconsistent manner. This adds 'extern' where it should be. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 0ae20b83eec8..46fb9612d884 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -29,25 +29,26 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); -int trace_seq_puts(struct trace_seq *s, const char *str); -int trace_seq_putc(struct trace_seq *s, unsigned char c); -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); -void *trace_seq_reserve(struct trace_seq *s, size_t len); -int trace_seq_path(struct trace_seq *s, struct path *path); -int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); +extern int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); -int trace_print_context(struct trace_iterator *iter); -int trace_print_lat_context(struct trace_iterator *iter); +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); -struct trace_event *ftrace_find_event(int type); -int register_ftrace_event(struct trace_event *event); -int unregister_ftrace_event(struct trace_event *event); +extern struct trace_event *ftrace_find_event(int type); +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) From b14b70a6a4e394c9630bcde17e07d3bcdcbca27e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:00 +0800 Subject: [PATCH 60/60] trace: make argument 'mem' of trace_seq_putmem() const Impact: fix build warning I passed a const value to trace_seq_putmem(), and I got compile warning. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 6 +++--- kernel/trace/trace_output.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6595074cbac5..d72b9a63b247 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -137,7 +137,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { if (len > ((PAGE_SIZE - 1) - s->len)) return 0; @@ -148,10 +148,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) { unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; + const unsigned char *data = mem; int i, j; #ifdef __BIG_ENDIAN diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 46fb9612d884..e0bde39c2dd9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -31,8 +31,9 @@ extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern int trace_seq_puts(struct trace_seq *s, const char *str); extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); extern void *trace_seq_reserve(struct trace_seq *s, size_t len); extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry,