diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/Kconfig | 15 | ||||
-rw-r--r-- | kernel/trace/Makefile | 2 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 6 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 82 | ||||
-rw-r--r-- | kernel/trace/power-traces.c | 1 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 64 | ||||
-rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace.c | 206 | ||||
-rw-r--r-- | kernel/trace/trace.h | 11 | ||||
-rw-r--r-- | kernel/trace/trace_branch.c | 19 | ||||
-rw-r--r-- | kernel/trace/trace_clock.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c) | 63 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 84 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_export.c | 87 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 108 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 139 | ||||
-rw-r--r-- | kernel/trace/trace_ksym.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_mmiotrace.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 24 | ||||
-rw-r--r-- | kernel/trace/trace_stat.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 186 | ||||
-rw-r--r-- | kernel/trace/trace_workqueue.c | 1 |
24 files changed, 645 insertions, 464 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f289..13e13d428cd3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool @@ -330,15 +328,6 @@ config BRANCH_TRACER Say N if unsure. -config POWER_TRACER - bool "Trace power consumption behavior" - depends on X86 - select GENERIC_TRACER - help - This tracer helps developers to analyze and optimize the kernel's - power management decisions, specifically the C-state and P-state - behavior. - config KSYM_TRACER bool "Trace read and write access on kernel memory locations" depends on HAVE_HW_BREAKPOINT @@ -451,7 +440,7 @@ config BLK_DEV_IO_TRACE config KPROBE_EVENT depends on KPROBES - depends on X86 + depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING default y diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d00c6fe23f54..78edc6490038 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o ifeq ($(CONFIG_PERF_EVENTS),y) -obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d9d6206e0b14..b3bc91a3f510 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/smp_lock.h> #include <linux/time.h> @@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return ret; - if (copy_to_user(arg, &buts, sizeof(buts))) + if (copy_to_user(arg, &buts, sizeof(buts))) { + blk_trace_remove(q); return -EFAULT; - + } return 0; } EXPORT_SYMBOL_GPL(blk_trace_setup); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1904797f4a8a..2404b59b3097 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -24,9 +24,11 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> +#include <linux/rcupdate.h> #include <trace/events/sched.h> @@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); -#endif - +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; @@ -2276,6 +2277,8 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -2402,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; +int ftrace_graph_filter_enabled; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * @@ -2424,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_count && !*pos) + if (!ftrace_graph_filter_enabled && !*pos) return (void *)1; return __g_next(m, pos); @@ -2470,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } @@ -2495,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int search_len; - int found = 0; + int fail = 1; int type, not; char *search; bool exists; @@ -2506,37 +2511,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (not) - return -EINVAL; + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; search_len = strlen(search); mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - break; - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; if (ftrace_match_record(rec, search, search_len, type)) { - /* ensure it is not already in the array */ + /* if it is in the array */ exists = false; - for (i = 0; i < *idx; i++) + for (i = 0; i < *idx; i++) { if (array[i] == rec->ip) { exists = true; break; } - if (!exists) - array[(*idx)++] = rec->ip; - found = 1; + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } } } while_for_each_ftrace_rec(); - +out: mutex_unlock(&ftrace_lock); - return found ? 0 : -EINVAL; + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; } static ssize_t @@ -2546,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&graph_lock); - if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { - ret = -EBUSY; - goto out_unlock; - } - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; goto out_unlock; @@ -3340,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3349,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565b01e6..a22582a06161 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -9,7 +9,6 @@ #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/module.h> -#include <linux/slab.h> #define CREATE_TRACE_POINTS #include <trace/events/power.h> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801b..41ca394feb22 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,12 +14,14 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/fs.h> +#include <asm/local.h> #include "trace.h" /* @@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -464,6 +474,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -1198,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1226,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - return; + goto out; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); @@ -1235,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1544,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1719,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } @@ -2230,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; + if (trace_recursive_lock()) goto out_nocheck; @@ -2467,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer, if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -2539,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -2575,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { @@ -2716,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -3060,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3081,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/time.h> +#include <asm/local.h> struct rb_page { u64 ts; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9e..44f916a04065 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,10 +32,11 @@ #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/rwsem.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/gfp.h> #include <linux/fs.h> #include "trace.h" @@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } static cpumask_var_t __read_mostly tracing_buffer_mask; -/* Define which cpu buffers are currently read in trace_pipe */ -static cpumask_var_t tracing_reader_cpumask; - #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) @@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; /* * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. */ static DEFINE_MUTEX(trace_types_lock); +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); @@ -297,6 +374,21 @@ static int __init set_buf_size(char *str) } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -502,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) static arch_spinlock_t ftrace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +unsigned long __read_mostly tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace @@ -515,7 +608,7 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data = tr->data[cpu]; + struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; @@ -525,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; @@ -747,10 +840,10 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } @@ -762,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -780,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -857,6 +950,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -866,6 +961,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -887,6 +984,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -895,6 +995,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } @@ -951,6 +1053,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; @@ -1084,7 +1191,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), @@ -1177,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1315,8 +1429,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1389,8 +1505,10 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, trace_buf, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1580,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) } /* - * No necessary locking here. The worst thing which can - * happen is loosing events consumed at the same time - * by a trace_pipe reader. - * Other than that, we don't risk to crash the ring buffer - * because it serializes the readers. - * * The current tracer is copied to avoid a global locking * all around. */ @@ -1623,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1640,12 +1753,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) } trace_event_read_lock(); + trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { + struct trace_iterator *iter = m->private; + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2836,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - /* We only allow one reader per cpu */ - if (cpu_file == TRACE_PIPE_ALL_CPU) { - if (!cpumask_empty(tracing_reader_cpumask)) { - ret = -EBUSY; - goto out; - } - cpumask_setall(tracing_reader_cpumask); - } else { - if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) - cpumask_set_cpu(cpu_file, tracing_reader_cpumask); - else { - ret = -EBUSY; - goto out; - } - } - /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { @@ -2907,12 +3008,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) - cpumask_clear(tracing_reader_cpumask); - else - cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); - - if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -3074,6 +3169,7 @@ waitagain: iter->pos = -1; trace_event_read_lock(); + trace_access_lock(iter->cpu_file); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3090,6 +3186,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ @@ -3215,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, } trace_event_read_lock(); + trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { @@ -3238,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); @@ -3539,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; + trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, count, info->cpu, 0); + trace_access_unlock(info->cpu); if (ret < 0) return 0; @@ -3670,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { @@ -3717,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } + trace_access_unlock(info->cpu); spd.nr_pages = i; /* did we read anything? */ @@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void) struct dentry *d_tracer; int cpu; + trace_access_lock_init(); + d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, @@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); -#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); @@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) - goto out_free_tracing_cpumask; - /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_cpumask_var(tracing_reader_cpumask); -out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..2825ef2c0b15 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +extern unsigned long tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; -extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -497,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; @@ -504,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count || test_tsk_trace_graph(current)) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -549,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task) * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input - * @idx: user input lenght + * @idx: user input length * @size: buffer size */ struct trace_parser { @@ -791,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ - extern struct ftrace_event_call event_##call; + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88c..b9bc4d470177 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return -1; if (percent_a > percent_b) return 1; - else - return 0; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; } static struct tracer_stat annotated_branch_stats = { diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072a..9d589d8dcd1a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> +#include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> @@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = cpu_clock(this_cpu); @@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - raw_local_irq_restore(flags); + local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c index f0d693005075..0565bb42566f 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_perf.c @@ -1,32 +1,41 @@ /* - * trace event based perf counter profiling + * trace event based perf event profiling/tracing * * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> - * + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ #include <linux/module.h> #include <linux/kprobes.h> #include "trace.h" +DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); +EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); + +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); static char *perf_trace_buf; static char *perf_trace_buf_nmi; -typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; /* Count the events in use (per event id, not per instance) */ -static int total_profile_count; +static int total_ref_count; -static int ftrace_profile_enable_event(struct ftrace_event_call *event) +static int perf_trace_event_enable(struct ftrace_event_call *event) { char *buf; int ret = -ENOMEM; - if (event->profile_count++ > 0) + if (event->perf_refcount++ > 0) return 0; - if (!total_profile_count) { + if (!total_ref_count) { buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; @@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(event); + ret = event->perf_event_enable(event); if (!ret) { - total_profile_count++; + total_ref_count++; return 0; } fail_buf_nmi: - if (!total_profile_count) { + if (!total_ref_count) { free_percpu(perf_trace_buf_nmi); free_percpu(perf_trace_buf); perf_trace_buf_nmi = NULL; perf_trace_buf = NULL; } fail_buf: - event->profile_count--; + event->perf_refcount--; return ret; } -int ftrace_profile_enable(int event_id) +int perf_trace_enable(int event_id) { struct ftrace_event_call *event; int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable && + if (event->id == event_id && event->perf_event_enable && try_module_get(event->mod)) { - ret = ftrace_profile_enable_event(event); + ret = perf_trace_event_enable(event); break; } } @@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id) return ret; } -static void ftrace_profile_disable_event(struct ftrace_event_call *event) +static void perf_trace_event_disable(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (--event->profile_count > 0) + if (--event->perf_refcount > 0) return; - event->profile_disable(event); + event->perf_event_disable(event); - if (!--total_profile_count) { + if (!--total_ref_count) { buf = perf_trace_buf; rcu_assign_pointer(perf_trace_buf, NULL); @@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) } } -void ftrace_profile_disable(int event_id) +void perf_trace_disable(int event_id) { struct ftrace_event_call *event; mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - ftrace_profile_disable_event(event); + perf_trace_event_disable(event); module_put(event->mod); break; } @@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id) mutex_unlock(&event_mutex); } -__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) { struct trace_entry *entry; char *trace_buf, *raw_data; int pc, cpu; + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + pc = preempt_count(); /* Protect the per cpu buffer, begin the rcu read side */ @@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, cpu = smp_processor_id(); if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); + trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference_sched(perf_trace_buf); if (!trace_buf) goto err; @@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; tracing_generic_entry_update(entry, *irq_flags, pc); @@ -161,4 +172,4 @@ err_recursion: local_irq_restore(*irq_flags); return NULL; } -EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..c697c7043349 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <linux/delay.h> #include <asm/setup.h> @@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, return 0; err: - if (field) { + if (field) kfree(field->name); - kfree(field->type); - } kfree(field); return -ENOMEM; @@ -520,41 +519,16 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - -static int trace_write_header(struct trace_seq *s) -{ - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); -} - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_field *field; struct trace_seq *s; + int common_field_count = 5; char *buf; - int r; + int r = 0; if (*ppos) return 0; @@ -565,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - /* If any of the first writes fail, so will the show_format. */ - trace_seq_printf(s, "name: %s\n", call->name); trace_seq_printf(s, "ID: %d\n", call->id); trace_seq_printf(s, "format:\n"); - trace_write_header(s); - r = call->show_format(call, s); + list_for_each_entry_reverse(field, &call->fields, link) { + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + const char *array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) { + r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + } else { + r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + } + + if (--common_field_count == 0) + r = trace_seq_printf(s, "\n"); + + if (!r) + break; + } + + if (r) + r = trace_seq_printf(s, "\nprint fmt: %s\n", + call->print_fmt); + if (!r) { /* * ug! The format output is bigger than a PAGE!! @@ -931,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->profile_enable) + if (call->id && call->perf_event_enable) trace_create_file("id", 0444, call->dir, call, id); @@ -948,10 +956,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4615f62a04f1..88c0b6dbd7fe 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/perf_event.h> +#include <linux/slab.h> #include "trace.h" #include "trace_output.h" diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; #undef __dynamic_array -#define __dynamic_array(type, item) +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ @@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ - .show_format = ftrace_format_##call, \ + .print_fmt = print, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37cf..9aed1a5cf553 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" @@ -18,6 +19,7 @@ struct fgraph_cpu_data { pid_t last_pid; int depth; int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { @@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (unlikely(!tr)) - return 0; - if (!ftrace_trace_task(current)) return 0; - if (!ftrace_graph_addr(trace->func)) + /* trace it when it is-nested-in or is a function enabled. */ + if (!(trace->depth || ftrace_graph_addr(trace->func))) return 0; local_irq_save(flags); @@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) } else { ret = 0; } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -241,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + static void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, @@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, trace, flags, pc); } - if (!trace->depth) - clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { int ret; - graph_array = tr; - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr) return 0; } -void set_graph_array(struct trace_array *tr) -{ - graph_array = tr; -} - static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -673,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments * equal to this depth. */ - *depth = call->depth - 1; + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; } /* Overhead */ @@ -721,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter, int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); - *depth = call->depth; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; } /* No overhead */ @@ -854,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct fgraph_data *data = iter->private; pid_t pid = ent->pid; int cpu = iter->cpu; + int func_match = 1; int ret; int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. This is the * return from a function, we now want the comments * to display at the same level of the bracket. */ - *depth = trace->depth - 1; + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } } if (print_graph_prologue(iter, s, 0, 0)) @@ -891,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. + */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6178abf3637e..1251e367bae9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv) event = strchr(group, '/') + 1; event[-1] = '\0'; if (strlen(group) == 0) { - pr_info("Group name is not specifiled\n"); + pr_info("Group name is not specified\n"); return -EINVAL; } } if (strlen(event) == 0) { - pr_info("Event name is not specifiled\n"); + pr_info("Event name is not specified\n"); return -EINVAL; } } @@ -673,7 +673,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -1155,86 +1155,66 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; + int pos = 0; - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; + const char *fmt, *arg; - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - return trace_seq_puts(s, "\n"); -} + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", + tp->args[i].name); + } -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); +#undef LEN_OR_ZERO - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); + /* return the length of print_fmt */ + return pos; } -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) +static int set_print_fmt(struct trace_probe *tp) { - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; + int len; + char *print_fmt; - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); + return 0; } #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_profile_func(struct kprobe *kp, +static __kprobes void kprobe_perf_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); @@ -1247,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1260,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } /* Kretprobe profile handler */ -static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); @@ -1277,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1291,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, + irq_flags, regs); } -static int probe_profile_enable(struct ftrace_event_call *call) +static int probe_perf_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1306,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call) return enable_kprobe(&tp->rp.kp); } -static void probe_profile_disable(struct ftrace_event_call *call) +static void probe_perf_disable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1331,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(kp, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_profile_func(kp, regs); + kprobe_perf_func(kp, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1345,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) kretprobe_trace_func(ri, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_profile_func(ri, regs); + kretprobe_perf_func(ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1359,30 +1340,33 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } + if (set_print_fmt(tp) < 0) + return -ENOMEM; call->event = &tp->event; call->id = register_ftrace_event(&tp->event); - if (!call->id) + if (!call->id) { + kfree(call->print_fmt); return -ENODEV; + } call->enabled = 0; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; #ifdef CONFIG_PERF_EVENTS - call->profile_enable = probe_profile_enable; - call->profile_disable = probe_profile_disable; + call->perf_event_enable = probe_perf_enable; + call->perf_event_disable = probe_perf_disable; #endif call->data = tp; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); unregister_ftrace_event(&tp->event); } return ret; @@ -1392,6 +1376,7 @@ static void unregister_probe_event(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); } /* Make a debugfs interface for controling probe points */ diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 94103cdcf9d8..d59cd6879477 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -23,6 +23,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace_output.h" diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834659ed..017fa376505d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/time.h> #include <asm/atomic.h> diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..81003b4d617f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@ #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> static inline int trace_valid_entry(struct trace_entry *entry) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239eb987..96cffb269e73 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@ #include <linux/list.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4e332b9e449c..4d6d711717f2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/ftrace.h> #include <linux/perf_event.h> @@ -143,70 +144,65 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); + int pos = 0; - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - trace_seq_puts(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return 0; + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); } - trace_seq_putc(s, '"'); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } +#undef LEN_OR_ZERO - return trace_seq_putc(s, '\n'); + /* return the length of print_fmt */ + return pos; } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) +static int set_syscall_print_fmt(struct ftrace_event_call *call) { - int ret; - struct syscall_trace_exit trace; + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; return 0; + } - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); } int syscall_enter_define_fields(struct ftrace_event_call *call) @@ -386,12 +382,22 @@ int init_syscall_trace(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); - if (!id) - return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; } int __init init_ftrace_syscalls(void) @@ -423,12 +429,12 @@ core_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS -static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); -static int sys_prof_refcount_enter; -static int sys_prof_refcount_exit; +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; -static void prof_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; @@ -438,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -450,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "profile buffer not large enough")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) return; - rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->id, &rctx, &flags); if (!rec) return; @@ -462,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysenter_enable(struct ftrace_event_call *call) +int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -473,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_enter) - ret = register_trace_sys_enter(prof_syscall_enter); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); } else { - set_bit(num, enabled_prof_enter_syscalls); - sys_prof_refcount_enter++; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysenter_disable(struct ftrace_event_call *call) +void perf_sysenter_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_enter--; - clear_bit(num, enabled_prof_enter_syscalls); - if (!sys_prof_refcount_enter) - unregister_trace_sys_enter(prof_syscall_enter); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter); mutex_unlock(&syscall_trace_lock); } -static void prof_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; @@ -510,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -525,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) * Impossible, but be paranoid with the future * How to put this check outside runtime? */ - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "exit event has grown above profile buffer size")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "exit event has grown above perf buffer size")) return; - rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->id, &rctx, &flags); if (!rec) return; @@ -537,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysexit_enable(struct ftrace_event_call *call) +int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -548,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_exit) - ret = register_trace_sys_exit(prof_syscall_exit); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit); if (ret) { pr_info("event trace: Could not activate" - "syscall entry trace point"); + "syscall exit trace point"); } else { - set_bit(num, enabled_prof_exit_syscalls); - sys_prof_refcount_exit++; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysexit_disable(struct ftrace_event_call *call) +void perf_sysexit_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_exit--; - clear_bit(num, enabled_prof_exit_syscalls); - if (!sys_prof_refcount_exit) - unregister_trace_sys_exit(prof_syscall_exit); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit); mutex_unlock(&syscall_trace_lock); } diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb07dffd..cc2d2faa7d9e 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/kref.h> #include "trace_stat.h" #include "trace.h" |