diff options
Diffstat (limited to 'kernel')
55 files changed, 7001 insertions, 2357 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..670665c6e2a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -38,7 +38,8 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * - * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ + * Audit userspace, documentation, tests, and bug/issue trackers: + * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -180,9 +181,21 @@ static char *audit_feature_names[2] = { "loginuid_immutable", }; - -/* Serialize requests from userspace. */ -DEFINE_MUTEX(audit_cmd_mutex); +/** + * struct audit_ctl_mutex - serialize requests from userspace + * @lock: the mutex used for locking + * @owner: the task which owns the lock + * + * Description: + * This is the lock struct used to ensure we only process userspace requests + * in an orderly fashion. We can't simply use a mutex/lock here because we + * need to track lock ownership so we don't end up blocking the lock owner in + * audit_log_start() or similar. + */ +static struct audit_ctl_mutex { + struct mutex lock; + void *owner; +} audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -227,6 +240,36 @@ int auditd_test_task(struct task_struct *task) } /** + * audit_ctl_lock - Take the audit control lock + */ +void audit_ctl_lock(void) +{ + mutex_lock(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = current; +} + +/** + * audit_ctl_unlock - Drop the audit control lock + */ +void audit_ctl_unlock(void) +{ + audit_cmd_mutex.owner = NULL; + mutex_unlock(&audit_cmd_mutex.lock); +} + +/** + * audit_ctl_owner_current - Test to see if the current task owns the lock + * + * Description: + * Return true if the current task owns the audit control lock, false if it + * doesn't own the lock. + */ +static bool audit_ctl_owner_current(void) +{ + return (current == audit_cmd_mutex.owner); +} + +/** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: @@ -443,15 +486,15 @@ static int audit_set_failure(u32 state) * Drop any references inside the auditd connection tracking struct and free * the memory. */ - static void auditd_conn_free(struct rcu_head *rcu) - { +static void auditd_conn_free(struct rcu_head *rcu) +{ struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); - } +} /** * auditd_set - Set/Reset the auditd connection state @@ -860,8 +903,8 @@ int audit_send_list(void *_dest) struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); @@ -902,8 +945,8 @@ static int audit_send_reply_thread(void *arg) struct audit_reply *reply = (struct audit_reply *)arg; struct sock *sk = audit_get_sk(reply->net); - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ @@ -1058,6 +1101,8 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + if (!ab) + return; audit_log_task_info(ab, current); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, @@ -1466,7 +1511,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ @@ -1475,7 +1520,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* Run custom bind function on netlink socket group connect or bind requests. */ @@ -1547,6 +1592,9 @@ static int __init audit_init(void) for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + mutex_init(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = NULL; + pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); @@ -1567,19 +1615,26 @@ static int __init audit_init(void) } postcore_initcall(audit_init); -/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +/* + * Process kernel command-line parameter at boot time. + * audit={0|off} or audit={1|on}. + */ static int __init audit_enable(char *str) { - long val; - - if (kstrtol(str, 0, &val)) - panic("audit: invalid 'audit' parameter value (%s)\n", str); - audit_default = (val ? AUDIT_ON : AUDIT_OFF); + if (!strcasecmp(str, "off") || !strcmp(str, "0")) + audit_default = AUDIT_OFF; + else if (!strcasecmp(str, "on") || !strcmp(str, "1")) + audit_default = AUDIT_ON; + else { + pr_err("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = AUDIT_ON; + } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) - panic("audit: error setting audit state (%d)\n", audit_default); + pr_err("audit: error setting audit state (%d)\n", + audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); @@ -1710,8 +1765,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex */ - if (!(auditd_test_task(current) || - (current == __mutex_owner(&audit_cmd_mutex)))) { + if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && @@ -2254,33 +2308,23 @@ EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_link_denied - report a link restriction denial * @operation: specific link operation - * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, const struct path *link) +void audit_log_link_denied(const char *operation) { struct audit_buffer *ab; - struct audit_names *name; - name = kzalloc(sizeof(*name), GFP_NOFS); - if (!name) + if (!audit_enabled || audit_dummy_context()) return; /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - goto out; + return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab, current); audit_log_format(ab, " res=0"); audit_log_end(ab); - - /* Generate AUDIT_PATH record with object. */ - name->type = AUDIT_TYPE_NORMAL; - audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry)); - audit_log_name(current->audit_context, name, link, 0, NULL); -out: - kfree(name); } /** diff --git a/kernel/audit.h b/kernel/audit.h index af5bc59487ed..214e14948370 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -341,4 +341,5 @@ extern struct list_head *audit_killed_trees(void); #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif -extern struct mutex audit_cmd_mutex; +extern void audit_ctl_lock(void); +extern void audit_ctl_unlock(void); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index fd353120e0d9..67e6956c0b61 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -709,7 +709,7 @@ static int prune_tree_thread(void *unused) schedule(); } - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -727,7 +727,7 @@ static int prune_tree_thread(void *unused) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } return 0; } @@ -924,7 +924,7 @@ static void audit_schedule_prune(void) */ void audit_kill_trees(struct list_head *list) { - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(list)) { @@ -942,7 +942,7 @@ void audit_kill_trees(struct list_head *list) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4a1758adb222..d7a807e81451 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -258,8 +258,8 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * goto exit_err; #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; + pr_err("AUDIT_FILTER_ENTRY is deprecated\n"); + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif @@ -496,7 +496,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e80459f7e132..4e0a4ac803db 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1511,30 +1511,28 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (!context) + if (!audit_enabled || !context) return; BUG_ON(context->in_syscall || context->name_count); - if (!audit_enabled) + state = context->state; + if (state == AUDIT_DISABLED) return; + context->dummy = !audit_n_rules; + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; + if (auditd_test_task(tsk)) + return; + } + context->arch = syscall_get_arch(); context->major = major; context->argv[0] = a1; context->argv[1] = a2; context->argv[2] = a3; context->argv[3] = a4; - - state = context->state; - context->dummy = !audit_n_rules; - if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { - context->prio = 0; - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); - } - if (state == AUDIT_DISABLED) - return; - context->serial = 0; context->ctime = current_kernel_time64(); context->in_syscall = 1; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d2bda5aa25d7..8dd9210d7db7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,8 +182,10 @@ static void bpf_tcp_release(struct sock *sk) psock->cork = NULL; } - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } out: rcu_read_unlock(); } @@ -211,6 +213,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); free_start_sg(psock->sock, md); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0244973ee544..4ca46df19c9a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1226,18 +1226,6 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - default: - return 0; - } -} - /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD expected_attach_type @@ -1465,6 +1453,18 @@ out_free_tp: #ifdef CONFIG_CGROUP_BPF +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int sockmap_get_from_fd(const union bpf_attr *attr, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4f63597c824d..f7674d676889 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa(vmcoreinfo_note); } +EXPORT_SYMBOL(paddr_vmcoreinfo_note); static int __init crash_save_vmcoreinfo_init(void) { @@ -453,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 90ff129c88a2..62c301ad0773 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i) kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); if (bp->bp_enabled) - kdb_printf("\n is enabled"); + kdb_printf("\n is enabled "); else kdb_printf("\n is disabled"); - kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n", bp->bp_addr, bp->bp_type, bp->bp_installed); kdb_printf("\n"); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index dbb0781a0533..e405677ee08d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p) kdb_current_regs = NULL; } +static void drop_newline(char *buf) +{ + size_t len = strlen(buf); + + if (len == 0) + return; + if (*(buf + len - 1) == '\n') + *(buf + len - 1) = '\0'; +} + /* * kdb_local - The main code for kdb. This routine is invoked on a * specific processor, it is not global. The main kdb() routine @@ -1327,6 +1337,7 @@ do_full_getstr: cmdptr = cmd_head; diag = kdb_parse(cmdbuf); if (diag == KDB_NOTFOUND) { + drop_newline(cmdbuf); kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); diag = 0; } @@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv) int symbolic = 0; int valid = 0; int phys = 0; + int raw = 0; kdbgetintenv("MDCOUNT", &mdcount); kdbgetintenv("RADIX", &radix); @@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv) repeat = mdcount * 16 / bytesperword; if (strcmp(argv[0], "mdr") == 0) { - if (argc != 2) + if (argc == 2 || (argc == 0 && last_addr != 0)) + valid = raw = 1; + else return KDB_ARGCOUNT; - valid = 1; } else if (isdigit(argv[0][2])) { bytesperword = (int)(argv[0][2] - '0'); if (bytesperword == 0) { @@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv) radix = last_radix; bytesperword = last_bytesperword; repeat = last_repeat; - mdcount = ((repeat * bytesperword) + 15) / 16; + if (raw) + mdcount = repeat; + else + mdcount = ((repeat * bytesperword) + 15) / 16; } if (argc) { @@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv) diag = kdbgetularg(argv[nextarg], &val); if (!diag) { mdcount = (int) val; - repeat = mdcount * 16 / bytesperword; + if (raw) + repeat = mdcount; + else + repeat = mdcount * 16 / bytesperword; } } if (argc >= nextarg+1) { @@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv) } } - if (strcmp(argv[0], "mdr") == 0) - return kdb_mdr(addr, mdcount); + if (strcmp(argv[0], "mdr") == 0) { + int ret; + last_addr = addr; + ret = kdb_mdr(addr, mdcount); + last_addr += mdcount; + last_repeat = mdcount; + last_bytesperword = bytesperword; // to make REPEAT happy + return ret; + } switch (radix) { case 10: @@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv) return 0; } -struct kdb_tm { - int tm_sec; /* seconds */ - int tm_min; /* minutes */ - int tm_hour; /* hours */ - int tm_mday; /* day of the month */ - int tm_mon; /* month */ - int tm_year; /* year */ -}; - -static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) -{ - /* This will work from 1970-2099, 2100 is not a leap year */ - static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, - 31, 30, 31, 30, 31 }; - memset(tm, 0, sizeof(*tm)); - tm->tm_sec = tv->tv_sec % (24 * 60 * 60); - tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + - (2 * 365 + 1); /* shift base from 1970 to 1968 */ - tm->tm_min = tm->tm_sec / 60 % 60; - tm->tm_hour = tm->tm_sec / 60 / 60; - tm->tm_sec = tm->tm_sec % 60; - tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); - tm->tm_mday %= (4*365+1); - mon_day[1] = 29; - while (tm->tm_mday >= mon_day[tm->tm_mon]) { - tm->tm_mday -= mon_day[tm->tm_mon]; - if (++tm->tm_mon == 12) { - tm->tm_mon = 0; - ++tm->tm_year; - mon_day[1] = 28; - } - } - ++tm->tm_mday; -} - /* * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). * I cannot call that code directly from kdb, it has an unconditional @@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) */ static void kdb_sysinfo(struct sysinfo *val) { - struct timespec uptime; - ktime_get_ts(&uptime); + u64 uptime = ktime_get_mono_fast_ns(); + memset(val, 0, sizeof(*val)); - val->uptime = uptime.tv_sec; + val->uptime = div_u64(uptime, NSEC_PER_SEC); val->loads[0] = avenrun[0]; val->loads[1] = avenrun[1]; val->loads[2] = avenrun[2]; @@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { - struct timespec now; - struct kdb_tm tm; + time64_t now; + struct tm tm; struct sysinfo val; if (argc) @@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - now = __current_kernel_time(); - kdb_gmtime(&now, &tm); - kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + now = __ktime_get_real_seconds(); + time64_to_tm(now, 0, &tm); + kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index d35cc2d3a4cc..990b3cc526c8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) } if (i >= ARRAY_SIZE(kdb_name_table)) { debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, + memmove(kdb_name_table, kdb_name_table+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-1)); } else { debug_kfree(knt1); knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, + memmove(kdb_name_table+i, kdb_name_table+i+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-i-1)); } diff --git a/kernel/events/core.c b/kernel/events/core.c index fc1c330c6bd6..2d5fe26551f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->hw.target) + put_task_struct(event->hw.target); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -8397,6 +8400,10 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ @@ -8434,6 +8441,10 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ @@ -9955,6 +9966,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ + get_task_struct(task); event->hw.target = task; } @@ -10070,6 +10082,8 @@ err_ns: perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); + if (event->hw.target) + put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0975b0268545..a5697119290e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -19,7 +19,6 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> #include <linux/types.h> -#include <linux/fs_struct.h> #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) diff --git a/kernel/fork.c b/kernel/fork.c index f71b67dc156d..242c8c93d285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -595,6 +595,8 @@ static void check_mm(struct mm_struct *mm) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + WARN_ON_ONCE(mm == current->mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); hmm_mm_destroy(mm); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -94,73 +94,46 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. - */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; - cpumask_var_t nmsk, *node_to_possible_cpumask; - - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. - */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) - goto out; + if (!cpumask_weight(cpu_mask)) + return 0; - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -181,19 +154,96 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } -done: +out: + return done; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outnpresmsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_possible_cpumask(node_to_possible_cpumask); -out: + +outnodemsk: + free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); +outcpumsk: free_cpumask_var(nmsk); return masks; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e5bcd94c1efb..75d8e7cf040e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -22,50 +22,123 @@ #include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" static int kexec_calculate_store_digests(struct kimage *image); +/* + * Currently this is the only default function that is exported as some + * architectures need it to do additional handlings. + * In the future, other default functions may be exported too if required. + */ +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + const struct kexec_file_ops * const *fops; + int ret = -ENOEXEC; + + for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) { + ret = (*fops)->probe(buf, buf_len); + if (!ret) { + image->fops = *fops; + return ret; + } + } + + return ret; +} + /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - return -ENOEXEC; + return kexec_image_probe_default(image, buf, buf_len); +} + +static void *kexec_image_load_default(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); } void * __weak arch_kexec_kernel_image_load(struct kimage *image) { - return ERR_PTR(-ENOEXEC); + return kexec_image_load_default(image); +} + +static int kexec_image_post_load_cleanup_default(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); } int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { - return -EINVAL; + return kexec_image_post_load_cleanup_default(image); } #ifdef CONFIG_KEXEC_VERIFY_SIG +static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { - return -EKEYREJECTED; + return kexec_image_verify_sig_default(image, buf, buf_len); } #endif -/* Apply relocations of type RELA */ +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } -/* Apply relocations of type REL */ +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; @@ -532,6 +605,9 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; + if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + return 0; + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; @@ -633,87 +709,29 @@ out: return ret; } -/* Actually load purgatory. Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) +#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY +/* + * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer to setup. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, + struct kexec_buf *kbuf) { - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, bss_align, bss_sz, bss_pad; - unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; - - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; - - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - if (!sechdrs) - return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - - /* - * Identify entry point section and make entry relative to section - * start. - */ - entry = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - entry -= sechdrs[i].sh_addr; - break; - } - } + const Elf_Shdr *sechdrs; + unsigned long bss_align; + unsigned long bss_sz; + unsigned long align; + int i, ret; - /* Determine how much memory is needed to load relocatable object. */ - bss_align = 1; - bss_sz = 0; + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + kbuf->buf_align = bss_align = 1; + kbuf->bufsz = bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -721,111 +739,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (kbuf.buf_align < align) - kbuf.buf_align = align; - kbuf.bufsz = ALIGN(kbuf.bufsz, align); - kbuf.bufsz += sechdrs[i].sh_size; + if (kbuf->buf_align < align) + kbuf->buf_align = align; + kbuf->bufsz = ALIGN(kbuf->bufsz, align); + kbuf->bufsz += sechdrs[i].sh_size; } else { - /* bss section */ if (bss_align < align) bss_align = align; bss_sz = ALIGN(bss_sz, align); bss_sz += sechdrs[i].sh_size; } } + kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align); + kbuf->memsz = kbuf->bufsz + bss_sz; + if (kbuf->buf_align < bss_align) + kbuf->buf_align = bss_align; - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (kbuf.bufsz & (bss_align - 1)) - bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - - kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; + kbuf->buffer = vzalloc(kbuf->bufsz); + if (!kbuf->buffer) + return -ENOMEM; + pi->purgatory_buf = kbuf->buffer; - /* Allocate buffer for purgatory */ - kbuf.buffer = vzalloc(kbuf.bufsz); - if (!kbuf.buffer) { - ret = -ENOMEM; + ret = kexec_add_buffer(kbuf); + if (ret) goto out; - } - if (kbuf.buf_align < bss_align) - kbuf.buf_align = bss_align; + return 0; +out: + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + return ret; +} - /* Add buffer to segment list */ - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - pi->purgatory_load_addr = kbuf.mem; +/* + * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer prepared to store purgatory. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, + struct kexec_buf *kbuf) +{ + unsigned long bss_addr; + unsigned long offset; + Elf_Shdr *sechdrs; + int i; + + /* + * The section headers in kexec_purgatory are read-only. In order to + * have them modifiable make a temporary copy. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, + pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + pi->sechdrs = sechdrs; - /* Load SHF_ALLOC sections */ - buf_addr = kbuf.buffer; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + kbuf.bufsz + bss_pad; + offset = 0; + bss_addr = kbuf->mem + kbuf->bufsz; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { + unsigned long align; + void *src, *dst; + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { + if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; bss_addr += sechdrs[i].sh_size; + continue; } - } - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; + offset = ALIGN(offset, align); + if (sechdrs[i].sh_flags & SHF_EXECINSTR && + pi->ehdr->e_entry >= sechdrs[i].sh_addr && + pi->ehdr->e_entry < (sechdrs[i].sh_addr + + sechdrs[i].sh_size)) { + kbuf->image->start -= sechdrs[i].sh_addr; + kbuf->image->start += kbuf->mem + offset; + } - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; + src = (void *)pi->ehdr + sechdrs[i].sh_offset; + dst = pi->purgatory_buf + offset; + memcpy(dst, src, sechdrs[i].sh_size); - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; + sechdrs[i].sh_addr = kbuf->mem + offset; + sechdrs[i].sh_offset = offset; + offset += sechdrs[i].sh_size; + } - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. - */ - pi->purgatory_buf = kbuf.buffer; - return ret; -out: - vfree(sechdrs); - vfree(kbuf.buffer); - return ret; + return 0; } static int kexec_apply_relocations(struct kimage *image) { int i, ret; struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; + const Elf_Shdr *sechdrs; + + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - /* Apply relocations */ for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; + const Elf_Shdr *relsec; + const Elf_Shdr *symtab; + Elf_Shdr *section; + + relsec = sechdrs + i; - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) + if (relsec->sh_type != SHT_RELA && + relsec->sh_type != SHT_REL) continue; /* @@ -834,12 +865,12 @@ static int kexec_apply_relocations(struct kimage *image) * symbol table. And ->sh_info contains section header * index of section to which relocations apply. */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) + if (relsec->sh_info >= pi->ehdr->e_shnum || + relsec->sh_link >= pi->ehdr->e_shnum) return -ENOEXEC; - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; + section = pi->sechdrs + relsec->sh_info; + symtab = sechdrs + relsec->sh_link; if (!(section->sh_flags & SHF_ALLOC)) continue; @@ -856,12 +887,12 @@ static int kexec_apply_relocations(struct kimage *image) * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); + if (relsec->sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi, section, + relsec, symtab); + else if (relsec->sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi, section, + relsec, symtab); if (ret) return ret; } @@ -869,10 +900,18 @@ static int kexec_apply_relocations(struct kimage *image) return 0; } -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) +/* + * kexec_load_purgatory - Load and relocate the purgatory object. + * @image: Image to add the purgatory to. + * @kbuf: Memory parameters to use. + * + * Allocates the memory needed for image->purgatory_info.sechdrs and + * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible + * to free the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) { struct purgatory_info *pi = &image->purgatory_info; int ret; @@ -880,55 +919,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; + pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = __kexec_load_purgatory(image, min, max, top_down); + ret = kexec_purgatory_setup_kbuf(pi, kbuf); if (ret) return ret; + ret = kexec_purgatory_setup_sechdrs(pi, kbuf); + if (ret) + goto out_free_kbuf; + ret = kexec_apply_relocations(image); if (ret) goto out; - *load_addr = pi->purgatory_load_addr; return 0; out: vfree(pi->sechdrs); pi->sechdrs = NULL; - +out_free_kbuf: vfree(pi->purgatory_buf); pi->purgatory_buf = NULL; return ret; } -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. + */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) { - Elf_Sym *syms; - Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; - int i, k; + const Elf_Shdr *sechdrs; + const Elf_Ehdr *ehdr; + const Elf_Sym *syms; const char *strtab; + int i, k; - if (!pi->sechdrs || !pi->ehdr) + if (!pi->ehdr) return NULL; - sechdrs = pi->sechdrs; ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; for (i = 0; i < ehdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_SYMTAB) @@ -937,8 +972,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, if (sechdrs[i].sh_link >= ehdr->e_shnum) /* Invalid strtab section number */ continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; /* Go through symbols for a match */ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { @@ -966,7 +1001,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; + const Elf_Sym *sym; Elf_Shdr *sechdr; sym = kexec_purgatory_find_symbol(pi, name); @@ -989,9 +1024,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value) { - Elf_Sym *sym; - Elf_Shdr *sechdrs; struct purgatory_info *pi = &image->purgatory_info; + const Elf_Sym *sym; + Elf_Shdr *sec; char *sym_buf; sym = kexec_purgatory_find_symbol(pi, name); @@ -1004,16 +1039,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sechdrs = pi->sechdrs; + sec = pi->sechdrs + sym->st_shndx; - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sec->sh_type == SHT_NOBITS) { pr_err("symbol %s is in a bss section. Cannot %s\n", name, get_value ? "get" : "set"); return -EINVAL; } - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; + sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); @@ -1022,3 +1056,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + phdr++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + *addr = buf; + *sz = elf_sz; + return 0; +} diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index fdac27588d60..83958c814439 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags, bool warn_on_exist) +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data, + bool warn_on_exist) { struct klp_shadow *new_shadow; void *shadow_data; @@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, if (shadow_data) goto exists; - /* Allocate a new shadow variable for use inside the lock below */ + /* + * Allocate a new shadow variable. Fill it with zeroes by default. + * More complex setting can be done by @ctor function. But it is + * called only when the buffer is really used (under klp_shadow_lock). + */ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); if (!new_shadow) return NULL; - new_shadow->obj = obj; - new_shadow->id = id; - - /* Initialize the shadow variable if data provided */ - if (data) - memcpy(new_shadow->data, data, size); - /* Look for <obj, id> again under the lock */ spin_lock_irqsave(&klp_shadow_lock, flags); shadow_data = klp_shadow_get(obj, id); @@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, goto exists; } + new_shadow->obj = obj; + new_shadow->id = id; + + if (ctor) { + int err; + + err = ctor(obj, new_shadow->data, ctor_data); + if (err) { + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n", + obj, id, err); + return NULL; + } + } + /* No <obj, id> found, so attach the newly allocated one */ hash_add_rcu(klp_shadow_hash, &new_shadow->node, (unsigned long)new_shadow->obj); @@ -170,26 +185,32 @@ exists: * klp_shadow_alloc() - allocate and add a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) + * + * Allocates @size bytes for new shadow variable data using @gfp_flags. + * The data are zeroed by default. They are further initialized by @ctor + * function if it is not NULL. The new shadow variable is then added + * to the global hashtable. * - * Allocates @size bytes for new shadow variable data using @gfp_flags - * and copies @size bytes from @data into the new shadow variable's own - * data space. If @data is NULL, @size bytes are still allocated, but - * no copy is performed. The new shadow variable is then added to the - * global hashtable. + * If an existing <obj, id> shadow variable can be found, this routine will + * issue a WARN, exit early and return NULL. * - * If an existing <obj, id> shadow variable can be found, this routine - * will issue a WARN, exit early and return NULL. + * This function guarantees that the constructor function is called only when + * the variable did not exist before. The cost is that @ctor is called + * in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on duplicate or * failure. */ -void *klp_shadow_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, true); } EXPORT_SYMBOL_GPL(klp_shadow_alloc); @@ -197,37 +218,51 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc); * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) * * Returns a pointer to existing shadow data if an <obj, id> shadow * variable is already present. Otherwise, it creates a new shadow * variable like klp_shadow_alloc(). * - * This function guarantees that only one shadow variable exists with - * the given @id for the given @obj. It also guarantees that the shadow - * variable will be initialized by the given @data only when it did not - * exist before. + * This function guarantees that only one shadow variable exists with the given + * @id for the given @obj. It also guarantees that the constructor function + * will be called only when the variable did not exist before. The cost is + * that @ctor is called in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on failure. */ -void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, false); } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); +static void klp_shadow_free_struct(struct klp_shadow *shadow, + klp_shadow_dtor_t dtor) +{ + hash_del_rcu(&shadow->node); + if (dtor) + dtor(shadow->obj, shadow->data); + kfree_rcu(shadow, rcu_head); +} + /** * klp_shadow_free() - detach and free a <obj, id> shadow variable * @obj: pointer to parent object * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for this <obj, id> shadow variable * instance, callers should stop referencing it accordingly. */ -void klp_shadow_free(void *obj, unsigned long id) +void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -239,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id) (unsigned long)obj) { if (klp_shadow_match(shadow, obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); + klp_shadow_free_struct(shadow, dtor); break; } } @@ -252,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free); /** * klp_shadow_free_all() - detach and free all <*, id> shadow variables * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for all <*, id> shadow variable * instances, callers should stop referencing them accordingly. */ -void klp_shadow_free_all(unsigned long id) +void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -266,10 +302,8 @@ void klp_shadow_free_all(unsigned long id) /* Delete all <*, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { - if (klp_shadow_match(shadow, shadow->obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); - } + if (klp_shadow_match(shadow, shadow->obj, id)) + klp_shadow_free_struct(shadow, dtor); } spin_unlock_irqrestore(&klp_shadow_lock, flags); diff --git a/kernel/panic.c b/kernel/panic.c index 9d833d913c84..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic); * is being removed anyway. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, + [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). + * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. */ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; @@ -554,6 +543,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, else dump_stack(); + print_irqtrace_events(current); + print_oops_end_marker(); /* Just a warning, don't kill lockdep. */ diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } diff --git a/kernel/pid.c b/kernel/pid.c index ed6c343fe50d..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d7503910ce2..fa39092b7aea 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * changed */ plist_del(node, &c->list); + /* fall through */ case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); + /* fall through */ case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f274fbef821d..2f4af216bd6e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -42,7 +42,6 @@ #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> -#include <linux/utsname.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> @@ -52,6 +51,7 @@ #include <linux/uaccess.h> #include <asm/sections.h> +#include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -2162,7 +2162,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; up_console_sem(); @@ -2781,6 +2781,7 @@ EXPORT_SYMBOL(unregister_console); */ void __init console_init(void) { + int ret; initcall_t *call; /* Setup the default TTY line discipline. */ @@ -2791,8 +2792,11 @@ void __init console_init(void) * inform about problems etc.. */ call = __con_initcall_start; + trace_initcall_level("console"); while (call < __con_initcall_end) { - (*call)(); + trace_initcall_start((*call)); + ret = (*call)(); + trace_initcall_finish((*call), ret); call++; } } @@ -3257,60 +3261,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -static char dump_stack_arch_desc_str[128]; - -/** - * dump_stack_set_arch_desc - set arch-specific str to show with task dumps - * @fmt: printf-style format string - * @...: arguments for the format string - * - * The configured string will be printed right after utsname during task - * dumps. Usually used to add arch-specific system identifiers. If an - * arch wants to make use of such an ID string, it should initialize this - * as soon as possible during boot. - */ -void __init dump_stack_set_arch_desc(const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), - fmt, args); - va_end(args); -} - -/** - * dump_stack_print_info - print generic debug info for dump_stack() - * @log_lvl: log level - * - * Arch-specific dump_stack() implementations can use this function to - * print out the same debug information as the generic dump_stack(). - */ -void dump_stack_print_info(const char *log_lvl) -{ - printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", - log_lvl, raw_smp_processor_id(), current->pid, current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - if (dump_stack_arch_desc_str[0] != '\0') - printk("%sHardware name: %s\n", - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -} - -/** - * show_regs_print_info - print generic debug info for show_regs() - * @log_lvl: log level - * - * show_regs() implementations can use this function to print out generic - * debug information. - */ -void show_regs_print_info(const char *log_lvl) -{ - dump_stack_print_info(log_lvl); -} - #endif diff --git a/kernel/resource.c b/kernel/resource.c index e270b5048988..2af6c03858b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..5e10aaeebfcc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -874,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP @@ -5560,6 +5560,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; finish_arch_post_lock_switch(); } mmdrop(mm); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2b124811947d..d2c6083304b4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -631,10 +631,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d1c7bf7c7e5b..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0951d1c58d2f..54dc31e7ab9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2975f195e1c4..1a3e9bddd17b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -141,13 +141,15 @@ static void cpuidle_idle_call(void) } /* - * Tell the RCU framework we are entering an idle section, - * so no more rcu read side critical sections and one more + * The RCU framework needs to be told that we are entering an idle + * section, so no more rcu read side critical sections and one more * step to the grace period */ - rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + default_idle_call(); goto exit_idle; } @@ -164,20 +166,37 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { + rcu_idle_enter(); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; } + + rcu_idle_exit(); } + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + bool stop_tick = true; + /* * Ask the cpuidle framework to choose a convenient idle state. */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev, &stop_tick); + + if (stop_tick) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); + + rcu_idle_enter(); + entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -222,6 +241,7 @@ static void do_idle(void) rmb(); if (cpu_is_offline(cpu)) { + tick_nohz_idle_stop_tick_protected(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -235,10 +255,12 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + tick_nohz_idle_restart_tick(); cpu_idle_poll(); - else + } else { cpuidle_idle_call(); + } arch_cpu_idle_exit(); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 86b77987435e..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime; @@ -859,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3deaee7a7a2..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttoling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags { diff --git a/kernel/signal.c b/kernel/signal.c index 47491aa3e790..d4ccea599692 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -770,7 +770,7 @@ static int check_kill_permission(int sig, struct siginfo *info, } } - return security_task_kill(t, info, sig, 0); + return security_task_kill(t, info, sig, NULL); } /** @@ -1361,7 +1361,7 @@ static int kill_as_cred_perm(const struct cred *cred, /* like kill_pid_info(), but doesn't use uid/euid of "current" */ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, - const struct cred *cred, u32 secid) + const struct cred *cred) { int ret = -EINVAL; struct task_struct *p; @@ -1380,7 +1380,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, secid); + ret = security_task_kill(p, info, sig, cred); if (ret) goto out_unlock; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6cafc008f6db..9791364925dc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -5,6 +5,11 @@ #include <asm/unistd.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* we can't #include <linux/syscalls.h> here, but tell gcc to not warn with -Wmissing-prototypes */ asmlinkage long sys_ni_syscall(void); @@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } +#ifndef COND_SYSCALL #define COND_SYSCALL(name) cond_syscall(sys_##name) +#endif /* COND_SYSCALL */ + +#ifndef COND_SYSCALL_COMPAT #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) +#endif /* COND_SYSCALL_COMPAT */ /* * This list is kept in the same order as include/uapi/asm-generic/unistd.h. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bdf7090b106d..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), + .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, @@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9b082ce86325..eda1210ce50f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -480,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -492,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -538,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -546,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -1190,6 +1206,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 6259dbc0191a..e0dbae98db9d 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -19,6 +19,11 @@ #include <linux/posix-timers.h> #include <linux/compat.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + asmlinkage long sys_ni_posix_timers(void) { pr_err_once("process %d (%s) attempted a POSIX timer syscall " @@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void) return -ENOSYS; } +#ifndef SYS_NI #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif SYS_NI(timer_create); SYS_NI(timer_gettime); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f3ab08caa2c3..646645e981f9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -122,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -143,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -474,7 +476,9 @@ __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) @@ -537,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -653,13 +654,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -668,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -712,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. */ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -786,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -795,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). - */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -839,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -865,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -903,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. + */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; + + ts->idle_calls++; - if (can_stop_idle_tick(cpu, ts)) { + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - expires = tick_nohz_stop_sched_tick(ts, now, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { @@ -949,8 +993,11 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -968,21 +1015,62 @@ void tick_nohz_irq_exit(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); + + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. + */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); - return ts->sleep_length; + return ktime_sub(next_event, now); } /** @@ -1031,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1041,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } @@ -1074,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ @@ -1169,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 954b43dbf21c..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,31 +38,37 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index fdbeeb02dde9..cf5c0828ee31 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif -extern time64_t __ktime_get_real_seconds(void); - #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 0b249e2f0c3c..c4f0f2e4126e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -606,7 +606,10 @@ config HIST_TRIGGERS event activity as an initial guide for further investigation using more advanced tools. - See Documentation/trace/events.txt. + Inter-event tracing of quantities such as latencies is also + supported using hist triggers under this option. + + See Documentation/trace/histogram.txt. If in doubt, say N. config MMIOTRACE_TEST diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eac9ce2c57a2..16bbf062018f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3902,14 +3902,13 @@ static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ const char this_mod[] = "__this_module"; - const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; - char modname[modname_size + 1]; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; - n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - if (n > modname_size) + if (n > sizeof(modname) - 1) return false; val = module_kallsyms_lookup_name(modname); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dcf1c4dd3efe..c9cb9767d49b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,7 @@ #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> +#include <linux/oom.h> #include <asm/local.h> @@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\ttime_stamp : type == %d\n", + RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); @@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s) enum { RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, + RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) +#define extended_time(event) \ + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); @@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); @@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static __always_inline void * rb_event_data(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ @@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/** + * ring_buffer_event_time_stamp - return the event's extended timestamp + * @event: the event to get the timestamp of + * + * Returns the extended timestamp associated with a data event. + * An extended time_stamp is a 64-bit timestamp represented + * internally in a special way that makes the best use of space + * contained within a ring buffer event. This function decodes + * it and maps it to a straight u64 value. + */ +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -451,6 +478,7 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; @@ -488,6 +516,7 @@ struct ring_buffer { u64 (*clock)(void); struct rb_irq_work irq_work; + bool time_stamp_abs; }; struct ring_buffer_iter { @@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { struct buffer_page *bpage, *tmp; + bool user_thread = current->mm != NULL; + gfp_t mflags; long i; + /* + * Check if the available memory is there first. + * Note, si_mem_available() only gives us a rough estimate of available + * memory. It may not be accurate. But we don't care, we just want + * to prevent doing any allocation when it is obvious that it is + * not going to succeed. + */ + i = si_mem_available(); + if (i < nr_pages) + return -ENOMEM; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; + + /* + * If a user thread allocates too much, and si_mem_available() + * reports there's enough memory, even though there is not. + * Make sure the OOM killer kills this thread. This can happen + * even with RETRY_MAYFAIL because another task may be doing + * an allocation after this task has taken all memory. + * This is the task the OOM killer needs to take out during this + * loop, even if it was triggered by an allocation somewhere else. + */ + if (user_thread) + set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; - /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, - cpu_to_node(cpu)); + mflags, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); + page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); + + if (user_thread && fatal_signal_pending(current)) + goto free_pages; } + if (user_thread) + clear_current_oom_origin(); return 0; @@ -1166,6 +1225,8 @@ free_pages: list_del_init(&bpage->list); free_buffer_page(bpage); } + if (user_thread) + clear_current_oom_origin(); return -ENOMEM; } @@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, buffer->clock = clock; } +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs) +{ + buffer->time_stamp_abs = abs; +} + +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer) +{ + return buffer->time_stamp_abs; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static inline unsigned long rb_page_entries(struct buffer_page *bpage) @@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path, do not inline */ static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { - event->type_len = RINGBUF_TYPE_TIME_EXTEND; + if (abs) + event->type_len = RINGBUF_TYPE_TIME_STAMP; + else + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* Not the first event on the page? */ - if (rb_event_index(event)) { + /* Not the first event on the page, or not delta? */ + if (abs || rb_event_index(event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2254,7 +2328,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the resevered space. */ if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + + event = rb_add_time_stamp(event, info->delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } @@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer static inline void rb_event_discard(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ @@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->write_stamp += delta; + } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->write_stamp = delta; } else cpu_buffer->write_stamp += event->time_delta; } @@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << bit))) + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) return 1; - val |= (1 << bit); + val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return 0; @@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + cpu_buffer->current_context &= + cpu_buffer->current_context - (1 << cpu_buffer->nest); +} + +/* The recursive locking above uses 4 bits */ +#define NESTED_BITS 4 + +/** + * ring_buffer_nest_start - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * The ring buffer has a safty mechanism to prevent recursion. + * But there may be a case where a trace needs to be done while + * tracing something else. In this case, calling this function + * will allow this function to nest within a currently active + * ring_buffer_lock_reserve(). + * + * Call this function before calling another ring_buffer_lock_reserve() and + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_start(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* Enabled by ring_buffer_nest_end() */ + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest += NESTED_BITS; +} + +/** + * ring_buffer_nest_end - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * Must be called after ring_buffer_nest_start() and after the + * ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_end(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* disabled by ring_buffer_nest_start() */ + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest -= NESTED_BITS; + preempt_enable_notrace(); } /** @@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); info->add_timestamp = 1; } @@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail) + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) info->delta = 0; /* See if we shot pass the end of this buffer page */ @@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* make sure this diff is calculated here */ barrier(); - /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { + if (ring_buffer_time_stamp_abs(buffer)) { + info.delta = info.ts; + rb_handle_timestamp(cpu_buffer, &info); + } else /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; if (unlikely(test_time_stamp(info.delta))) rb_handle_timestamp(cpu_buffer, &info); @@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, struct buffer_page *reader; int nr_loops = 0; + if (ts) + *ts = 0; again: /* * We repeat when a time extend is encountered. @@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); @@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; + if (ts) + *ts = 0; + cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 300f4ea39646..dfbcf9ee1447 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,6 +41,7 @@ #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> +#include <linux/sched/clock.h> #include <linux/sched/rt.h> #include "trace.h" @@ -1168,6 +1169,14 @@ static struct { ARCH_TRACE_CLOCKS }; +bool trace_clock_in_ns(struct trace_array *tr) +{ + if (trace_clocks[tr->clock_id].in_ns) + return true; + + return false; +} + /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, *current_rb = trace_file->tr->trace_buffer.buffer; - if ((trace_file->flags & + if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ @@ -2380,7 +2389,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() -*/ + */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, @@ -4515,6 +4524,9 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif + "\n timestamp_mode\t-view the mode used to timestamp events\n" + " delta: Delta difference against a buffer-wide timestamp\n" + " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" @@ -4691,8 +4703,9 @@ static const char readme_msg[] = "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" - "\t .syscall display a syscall id as a syscall name\n\n" - "\t .log2 display log2 value rather than raw number\n\n" + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" @@ -6202,7 +6215,7 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; @@ -6282,6 +6295,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return ret; } +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + + mutex_lock(&trace_types_lock); + + if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer)) + seq_puts(m, "delta [absolute]\n"); + else + seq_puts(m, "[delta] absolute\n"); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (abs && tr->time_stamp_abs_ref++) + goto out; + + if (!abs) { + if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + ret = -EINVAL; + goto out; + } + + if (--tr->time_stamp_abs_ref) + goto out; + } + + ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) + ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); +#endif + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + struct ftrace_buffer_info { struct trace_iterator iter; void *spare; @@ -6529,6 +6607,13 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +static const struct file_operations trace_time_stamp_mode_fops = { + .open = tracing_time_stamp_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, @@ -7699,6 +7784,7 @@ static int instance_mkdir(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); + INIT_LIST_HEAD(&tr->hist_vars); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -7851,6 +7937,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + trace_create_file("timestamp_mode", 0444, d_tracer, tr, + &trace_time_stamp_mode_fops); + create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) @@ -8446,6 +8535,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); + INIT_LIST_HEAD(&global_trace.hist_vars); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); @@ -8507,3 +8597,21 @@ __init static int clear_boot_tracer(void) fs_initcall(tracer_init_tracefs); late_initcall_sync(clear_boot_tracer); + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__init static int tracing_set_default_clock(void) +{ + /* sched_clock_stable() is determined in late_initcall */ + if (!trace_boot_clock && !sched_clock_stable()) { + printk(KERN_WARNING + "Unstable clock detected, switching default tracing clock to \"global\"\n" + "If you want to keep using the local clock, then add:\n" + " \"trace_clock=local\"\n" + "on the kernel command line\n"); + tracing_set_clock(&global_trace, "global"); + } + + return 0; +} +late_initcall_sync(tracing_set_default_clock); +#endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2a6d0325a761..6fb46a06c9dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -273,6 +273,8 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif + int time_stamp_abs_ref; + struct list_head hist_vars; }; enum { @@ -286,6 +288,11 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); +extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); + +extern bool trace_clock_in_ns(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -1209,12 +1216,11 @@ struct ftrace_event_field { int is_signed; }; +struct prog_entry; + struct event_filter { - int n_preds; /* Number assigned */ - int a_preds; /* allocated */ - struct filter_pred __rcu *preds; - struct filter_pred __rcu *root; - char *filter_string; + struct prog_entry __rcu *prog; + char *filter_string; }; struct event_subsystem { @@ -1291,7 +1297,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); + *tt = event_triggers_call(file, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1328,7 +1334,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } /** @@ -1361,7 +1367,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } #define FILTER_PRED_INVALID ((unsigned short)-1) @@ -1406,12 +1412,8 @@ struct filter_pred { unsigned short *ops; struct ftrace_event_field *field; int offset; - int not; + int not; int op; - unsigned short index; - unsigned short parent; - unsigned short left; - unsigned short right; }; static inline bool is_string_field(struct ftrace_event_field *field) @@ -1543,6 +1545,8 @@ extern void pause_named_trigger(struct event_trigger_data *data); extern void unpause_named_trigger(struct event_trigger_data *data); extern void set_named_trigger_data(struct event_trigger_data *data, struct event_trigger_data *named_data); +extern struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); @@ -1586,7 +1590,8 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec); + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 5fdc779f411d..d8a188e0418a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - local_irq_save(flags); + raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = sched_clock_cpu(this_cpu); @@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - local_irq_restore(flags); + raw_local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 2c416509b834..c79193e598f5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) ret = strncpy_from_user( func, u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); + if (ret == KSYM_NAME_LEN) + ret = -E2BIG; if (ret < 0) goto out; @@ -300,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) return -ENOMEM; ret = strncpy_from_user( path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret == PATH_MAX) + return -E2BIG; if (ret < 0) goto out; if (path[0] == '\0') { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a764aec3c9a1..9b4716bb8bb0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -33,163 +33,595 @@ "# Only events with the given fields will be affected.\n" \ "# If no events are modified, an error message will be displayed here" -enum filter_op_ids -{ - OP_OR, - OP_AND, - OP_GLOB, - OP_NE, - OP_EQ, - OP_LT, - OP_LE, - OP_GT, - OP_GE, - OP_BAND, - OP_NOT, - OP_NONE, - OP_OPEN_PAREN, -}; +/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */ +#define OPS \ + C( OP_GLOB, "~" ), \ + C( OP_NE, "!=" ), \ + C( OP_EQ, "==" ), \ + C( OP_LE, "<=" ), \ + C( OP_LT, "<" ), \ + C( OP_GE, ">=" ), \ + C( OP_GT, ">" ), \ + C( OP_BAND, "&" ), \ + C( OP_MAX, NULL ) -struct filter_op { - int id; - char *string; - int precedence; -}; +#undef C +#define C(a, b) a -/* Order must be the same as enum filter_op_ids above */ -static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_GLOB, "~", 4 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_BAND, "&", 6 }, - { OP_NOT, "!", 6 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, -}; +enum filter_op_ids { OPS }; -enum { - FILT_ERR_NONE, - FILT_ERR_INVALID_OP, - FILT_ERR_UNBALANCED_PAREN, - FILT_ERR_TOO_MANY_OPERANDS, - FILT_ERR_OPERAND_TOO_LONG, - FILT_ERR_FIELD_NOT_FOUND, - FILT_ERR_ILLEGAL_FIELD_OP, - FILT_ERR_ILLEGAL_INTVAL, - FILT_ERR_BAD_SUBSYS_FILTER, - FILT_ERR_TOO_MANY_PREDS, - FILT_ERR_MISSING_FIELD, - FILT_ERR_INVALID_FILTER, - FILT_ERR_IP_FIELD_ONLY, - FILT_ERR_ILLEGAL_NOT_OP, -}; +#undef C +#define C(a, b) b -static char *err_text[] = { - "No error", - "Invalid operator", - "Unbalanced parens", - "Too many operands", - "Operand too long", - "Field not found", - "Illegal operation for field type", - "Illegal integer value", - "Couldn't find or set field in one of a subsystem's events", - "Too many terms in predicate expression", - "Missing field name and/or value", - "Meaningless filter expression", - "Only 'ip' field is supported for function trace", - "Illegal use of '!'", -}; +static const char * ops[] = { OPS }; -struct opstack_op { - enum filter_op_ids op; - struct list_head list; -}; +/* + * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND + * pred_funcs_##type below must match the order of them above. + */ +#define PRED_FUNC_START OP_LE +#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) + +#define ERRORS \ + C(NONE, "No error"), \ + C(INVALID_OP, "Invalid operator"), \ + C(TOO_MANY_OPEN, "Too many '('"), \ + C(TOO_MANY_CLOSE, "Too few '('"), \ + C(MISSING_QUOTE, "Missing matching quote"), \ + C(OPERAND_TOO_LONG, "Operand too long"), \ + C(EXPECT_STRING, "Expecting string field"), \ + C(EXPECT_DIGIT, "Expecting numeric field"), \ + C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ + C(FIELD_NOT_FOUND, "Field not found"), \ + C(ILLEGAL_INTVAL, "Illegal integer value"), \ + C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ + C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ + C(INVALID_FILTER, "Meaningless filter expression"), \ + C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), + +#undef C +#define C(a, b) FILT_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static char *err_text[] = { ERRORS }; + +/* Called after a '!' character but "!=" and "!~" are not "not"s */ +static bool is_not(const char *str) +{ + switch (str[1]) { + case '=': + case '~': + return false; + } + return true; +} -struct postfix_elt { - enum filter_op_ids op; - char *operand; - struct list_head list; +/** + * prog_entry - a singe entry in the filter program + * @target: Index to jump to on a branch (actually one minus the index) + * @when_to_branch: The value of the result of the predicate to do a branch + * @pred: The predicate to execute. + */ +struct prog_entry { + int target; + int when_to_branch; + struct filter_pred *pred; }; -struct filter_parse_state { - struct filter_op *ops; - struct list_head opstack; - struct list_head postfix; +/** + * update_preds- assign a program entry a label target + * @prog: The program array + * @N: The index of the current entry in @prog + * @when_to_branch: What to assign a program entry for its branch condition + * + * The program entry at @N has a target that points to the index of a program + * entry that can have its target and when_to_branch fields updated. + * Update the current program entry denoted by index @N target field to be + * that of the updated entry. This will denote the entry to update if + * we are processing an "||" after an "&&" + */ +static void update_preds(struct prog_entry *prog, int N, int invert) +{ + int t, s; + + t = prog[N].target; + s = prog[t].target; + prog[t].when_to_branch = invert; + prog[t].target = N; + prog[N].target = s; +} + +struct filter_parse_error { int lasterr; int lasterr_pos; - - struct { - char *string; - unsigned int cnt; - unsigned int tail; - } infix; - - struct { - char string[MAX_FILTER_STR_VAL]; - int pos; - unsigned int tail; - } operand; }; -struct pred_stack { - struct filter_pred **preds; - int index; +static void parse_error(struct filter_parse_error *pe, int err, int pos) +{ + pe->lasterr = err; + pe->lasterr_pos = pos; +} + +typedef int (*parse_pred_fn)(const char *str, void *data, int pos, + struct filter_parse_error *pe, + struct filter_pred **pred); + +enum { + INVERT = 1, + PROCESS_AND = 2, + PROCESS_OR = 4, }; -/* If not of not match is equal to not of not, then it is a match */ +/* + * Without going into a formal proof, this explains the method that is used in + * parsing the logical expressions. + * + * For example, if we have: "a && !(!b || (c && g)) || d || e && !f" + * The first pass will convert it into the following program: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * To do this, we use a data structure to represent each of the above + * predicate and conditions that has: + * + * predicate, when_to_branch, invert, target + * + * The "predicate" will hold the function to determine the result "r". + * The "when_to_branch" denotes what "r" should be if a branch is to be taken + * "&&" would contain "!r" or (0) and "||" would contain "r" or (1). + * The "invert" holds whether the value should be reversed before testing. + * The "target" contains the label "l#" to jump to. + * + * A stack is created to hold values when parentheses are used. + * + * To simplify the logic, the labels will start at 0 and not 1. + * + * The possible invert values are 1 and 0. The number of "!"s that are in scope + * before the predicate determines the invert value, if the number is odd then + * the invert value is 1 and 0 otherwise. This means the invert value only + * needs to be toggled when a new "!" is introduced compared to what is stored + * on the stack, where parentheses were used. + * + * The top of the stack and "invert" are initialized to zero. + * + * ** FIRST PASS ** + * + * #1 A loop through all the tokens is done: + * + * #2 If the token is an "(", the stack is push, and the current stack value + * gets the current invert value, and the loop continues to the next token. + * The top of the stack saves the "invert" value to keep track of what + * the current inversion is. As "!(a && !b || c)" would require all + * predicates being affected separately by the "!" before the parentheses. + * And that would end up being equivalent to "(!a || b) && !c" + * + * #3 If the token is an "!", the current "invert" value gets inverted, and + * the loop continues. Note, if the next token is a predicate, then + * this "invert" value is only valid for the current program entry, + * and does not affect other predicates later on. + * + * The only other acceptable token is the predicate string. + * + * #4 A new entry into the program is added saving: the predicate and the + * current value of "invert". The target is currently assigned to the + * previous program index (this will not be its final value). + * + * #5 We now enter another loop and look at the next token. The only valid + * tokens are ")", "&&", "||" or end of the input string "\0". + * + * #6 The invert variable is reset to the current value saved on the top of + * the stack. + * + * #7 The top of the stack holds not only the current invert value, but also + * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher + * precedence than "||". That is "a && b || c && d" is equivalent to + * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs + * to be processed. This is the case if an "&&" was the last token. If it was + * then we call update_preds(). This takes the program, the current index in + * the program, and the current value of "invert". More will be described + * below about this function. + * + * #8 If the next token is "&&" then we set a flag in the top of the stack + * that denotes that "&&" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #9 Otherwise, if a "||" needs to be processed then update_preds() is called. + * This is called with the program, the current index in the program, but + * this time with an inverted value of "invert" (that is !invert). This is + * because the value taken will become the "when_to_branch" value of the + * program. + * Note, this is called when the next token is not an "&&". As stated before, + * "&&" takes higher precedence, and "||" should not be processed yet if the + * next logical operation is "&&". + * + * #10 If the next token is "||" then we set a flag in the top of the stack + * that denotes that "||" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #11 If this is the end of the input string "\0" then we break out of both + * loops. + * + * #12 Otherwise, the next token is ")", where we pop the stack and continue + * this inner loop. + * + * Now to discuss the update_pred() function, as that is key to the setting up + * of the program. Remember the "target" of the program is initialized to the + * previous index and not the "l" label. The target holds the index into the + * program that gets affected by the operand. Thus if we have something like + * "a || b && c", when we process "a" the target will be "-1" (undefined). + * When we process "b", its target is "0", which is the index of "a", as that's + * the predicate that is affected by "||". But because the next token after "b" + * is "&&" we don't call update_preds(). Instead continue to "c". As the + * next token after "c" is not "&&" but the end of input, we first process the + * "&&" by calling update_preds() for the "&&" then we process the "||" by + * callin updates_preds() with the values for processing "||". + * + * What does that mean? What update_preds() does is to first save the "target" + * of the program entry indexed by the current program entry's "target" + * (remember the "target" is initialized to previous program entry), and then + * sets that "target" to the current index which represents the label "l#". + * That entry's "when_to_branch" is set to the value passed in (the "invert" + * or "!invert"). Then it sets the current program entry's target to the saved + * "target" value (the old value of the program that had its "target" updated + * to the label). + * + * Looking back at "a || b && c", we have the following steps: + * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target + * "||" - flag that we need to process "||"; continue outer loop + * "b" - prog[1] = { "b", X, 0 } + * "&&" - flag that we need to process "&&"; continue outer loop + * (Notice we did not process "||") + * "c" - prog[2] = { "c", X, 1 } + * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&" + * t = prog[2].target; // t = 1 + * s = prog[t].target; // s = 0 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 0; + * prog[2].target = s; + * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||" + * t = prog[2].target; // t = 0 + * s = prog[t].target; // s = -1 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 1; + * prog[2].target = s; + * + * #13 Which brings us to the final step of the first pass, which is to set + * the last program entry's when_to_branch and target, which will be + * when_to_branch = 0; target = N; ( the label after the program entry after + * the last program entry processed above). + * + * If we denote "TRUE" to be the entry after the last program entry processed, + * and "FALSE" the program entry after that, we are now done with the first + * pass. + * + * Making the above "a || b && c" have a progam of: + * prog[0] = { "a", 1, 2 } + * prog[1] = { "b", 0, 2 } + * prog[2] = { "c", 0, 3 } + * + * Which translates into: + * n0: r = a; l0: if (r) goto l2; + * n1: r = b; l1: if (!r) goto l2; + * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;" + * T: return TRUE; l3: + * F: return FALSE + * + * Although, after the first pass, the program is correct, it is + * inefficient. The simple sample of "a || b && c" could be easily been + * converted into: + * n0: r = a; if (r) goto T + * n1: r = b; if (!r) goto F + * n2: r = c; if (!r) goto F + * T: return TRUE; + * F: return FALSE; + * + * The First Pass is over the input string. The next too passes are over + * the program itself. + * + * ** SECOND PASS ** + * + * Which brings us to the second pass. If a jump to a label has the + * same condition as that label, it can instead jump to its target. + * The original example of "a && !(!b || (c && g)) || d || e && !f" + * where the first pass gives us: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F: + * T: return TRUE; + * F: return FALSE + * + * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;". + * And "l5: if (r) goto T", we could optimize this by converting l3 and l4 + * to go directly to T. To accomplish this, we start from the last + * entry in the program and work our way back. If the target of the entry + * has the same "when_to_branch" then we could use that entry's target. + * Doing this, the above would end up as: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T; + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE + * F: return FALSE + * + * In that same pass, if the "when_to_branch" doesn't match, we can simply + * go to the program entry after the label. That is, "l2: if (!r) goto l4;" + * where "l4: if (r) goto T;", then we can convert l2 to be: + * "l2: if (!r) goto n5;". + * + * This will have the second pass give us: + * n1: r=a; l1: if (!r) goto n5; + * n2: r=b; l2: if (!r) goto n5; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * Notice, all the "l#" labels are no longer used, and they can now + * be discarded. + * + * ** THIRD PASS ** + * + * For the third pass we deal with the inverts. As they simply just + * make the "when_to_branch" get inverted, a simple loop over the + * program to that does: "when_to_branch ^= invert;" will do the + * job, leaving us with: + * n1: r=a; if (!r) goto n5; + * n2: r=b; if (!r) goto n5; + * n3: r=c: if (!r) goto T; + * n4: r=g; if (!r) goto T; + * n5: r=d; if (r) goto T + * n6: r=e; if (!r) goto F; + * n7: r=f; if (r) goto F + * T: return TRUE + * F: return FALSE + * + * As "r = a; if (!r) goto n5;" is obviously the same as + * "if (!a) goto n5;" without doing anything we can interperate the + * program as: + * n1: if (!a) goto n5; + * n2: if (!b) goto n5; + * n3: if (!c) goto T; + * n4: if (!g) goto T; + * n5: if (d) goto T + * n6: if (!e) goto F; + * n7: if (f) goto F + * T: return TRUE + * F: return FALSE + * + * Since the inverts are discarded at the end, there's no reason to store + * them in the program array (and waste memory). A separate array to hold + * the inverts is used and freed at the end. + */ +static struct prog_entry * +predicate_parse(const char *str, int nr_parens, int nr_preds, + parse_pred_fn parse_pred, void *data, + struct filter_parse_error *pe) +{ + struct prog_entry *prog_stack; + struct prog_entry *prog; + const char *ptr = str; + char *inverts = NULL; + int *op_stack; + int *top; + int invert = 0; + int ret = -ENOMEM; + int len; + int N = 0; + int i; + + nr_preds += 2; /* For TRUE and FALSE */ + + op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL); + if (!op_stack) + return ERR_PTR(-ENOMEM); + prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL); + if (!prog_stack) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL); + if (!inverts) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + + top = op_stack; + prog = prog_stack; + *top = 0; + + /* First pass */ + while (*ptr) { /* #1 */ + const char *next = ptr++; + + if (isspace(*next)) + continue; + + switch (*next) { + case '(': /* #2 */ + if (top - op_stack > nr_parens) + return ERR_PTR(-EINVAL); + *(++top) = invert; + continue; + case '!': /* #3 */ + if (!is_not(next)) + break; + invert = !invert; + continue; + } + + if (N >= nr_preds) { + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); + goto out_free; + } + + inverts[N] = invert; /* #4 */ + prog[N].target = N-1; + + len = parse_pred(next, data, ptr - str, pe, &prog[N].pred); + if (len < 0) { + ret = len; + goto out_free; + } + ptr = next + len; + + N++; + + ret = -1; + while (1) { /* #5 */ + next = ptr++; + if (isspace(*next)) + continue; + + switch (*next) { + case ')': + case '\0': + break; + case '&': + case '|': + if (next[1] == next[0]) { + ptr++; + break; + } + default: + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, + next - str); + goto out_free; + } + + invert = *top & INVERT; + + if (*top & PROCESS_AND) { /* #7 */ + update_preds(prog, N - 1, invert); + *top &= ~PROCESS_AND; + } + if (*next == '&') { /* #8 */ + *top |= PROCESS_AND; + break; + } + if (*top & PROCESS_OR) { /* #9 */ + update_preds(prog, N - 1, !invert); + *top &= ~PROCESS_OR; + } + if (*next == '|') { /* #10 */ + *top |= PROCESS_OR; + break; + } + if (!*next) /* #11 */ + goto out; + + if (top == op_stack) { + ret = -1; + /* Too few '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str); + goto out_free; + } + top--; /* #12 */ + } + } + out: + if (top != op_stack) { + /* Too many '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str); + goto out_free; + } + + prog[N].pred = NULL; /* #13 */ + prog[N].target = 1; /* TRUE */ + prog[N+1].pred = NULL; + prog[N+1].target = 0; /* FALSE */ + prog[N-1].target = N; + prog[N-1].when_to_branch = false; + + /* Second Pass */ + for (i = N-1 ; i--; ) { + int target = prog[i].target; + if (prog[i].when_to_branch == prog[target].when_to_branch) + prog[i].target = prog[target].target; + } + + /* Third Pass */ + for (i = 0; i < N; i++) { + invert = inverts[i] ^ prog[i].when_to_branch; + prog[i].when_to_branch = invert; + /* Make sure the program always moves forward */ + if (WARN_ON(prog[i].target <= i)) { + ret = -EINVAL; + goto out_free; + } + } + + return prog; +out_free: + kfree(op_stack); + kfree(prog_stack); + kfree(inverts); + return ERR_PTR(ret); +} + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr < val); \ - return !!match == !pred->not; \ + return *addr < val; \ } \ static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr <= val); \ - return !!match == !pred->not; \ + return *addr <= val; \ } \ static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr > val); \ - return !!match == !pred->not; \ + return *addr > val; \ } \ static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr >= val); \ - return !!match == !pred->not; \ + return *addr >= val; \ } \ static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = !!(*addr & val); \ - return match == !pred->not; \ + return !!(*addr & val); \ } \ static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LT_##type, \ filter_pred_LE_##type, \ - filter_pred_GT_##type, \ + filter_pred_LT_##type, \ filter_pred_GE_##type, \ + filter_pred_GT_##type, \ filter_pred_BAND_##type, \ }; -#define PRED_FUNC_START OP_LT - #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) static int filter_pred_cpu(struct filter_pred *pred, void *event) { int cpu, cmp; - int match = 0; cpu = raw_smp_processor_id(); cmp = pred->val; switch (pred->op) { case OP_EQ: - match = cpu == cmp; - break; + return cpu == cmp; + case OP_NE: + return cpu != cmp; case OP_LT: - match = cpu < cmp; - break; + return cpu < cmp; case OP_LE: - match = cpu <= cmp; - break; + return cpu <= cmp; case OP_GT: - match = cpu > cmp; - break; + return cpu > cmp; case OP_GE: - match = cpu >= cmp; - break; + return cpu >= cmp; default: - break; + return 0; } - - return !!match == !pred->not; } /* Filter predicate for COMM. */ static int filter_pred_comm(struct filter_pred *pred, void *event) { - int cmp, match; + int cmp; cmp = pred->regex.match(current->comm, &pred->regex, - pred->regex.field_len); - match = cmp ^ pred->not; - - return match; + TASK_COMM_LEN); + return cmp ^ pred->not; } static int filter_pred_none(struct filter_pred *pred, void *event) @@ -366,6 +790,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) return 1; return 0; } + /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -426,10 +851,9 @@ static void filter_build_regex(struct filter_pred *pred) struct regex *r = &pred->regex; char *search; enum regex_type type = MATCH_FULL; - int not = 0; if (pred->op == OP_GLOB) { - type = filter_parse_regex(r->pattern, r->len, &search, ¬); + type = filter_parse_regex(r->pattern, r->len, &search, &pred->not); r->len = strlen(search); memmove(r->pattern, search, r->len+1); } @@ -451,210 +875,32 @@ static void filter_build_regex(struct filter_pred *pred) r->match = regex_match_glob; break; } - - pred->not ^= not; -} - -enum move_type { - MOVE_DOWN, - MOVE_UP_FROM_LEFT, - MOVE_UP_FROM_RIGHT -}; - -static struct filter_pred * -get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, - int index, enum move_type *move) -{ - if (pred->parent & FILTER_PRED_IS_RIGHT) - *move = MOVE_UP_FROM_RIGHT; - else - *move = MOVE_UP_FROM_LEFT; - pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; - - return pred; -} - -enum walk_return { - WALK_PRED_ABORT, - WALK_PRED_PARENT, - WALK_PRED_DEFAULT, -}; - -typedef int (*filter_pred_walkcb_t) (enum move_type move, - struct filter_pred *pred, - int *err, void *data); - -static int walk_pred_tree(struct filter_pred *preds, - struct filter_pred *root, - filter_pred_walkcb_t cb, void *data) -{ - struct filter_pred *pred = root; - enum move_type move = MOVE_DOWN; - int done = 0; - - if (!preds) - return -EINVAL; - - do { - int err = 0, ret; - - ret = cb(move, pred, &err, data); - if (ret == WALK_PRED_ABORT) - return err; - if (ret == WALK_PRED_PARENT) - goto get_parent; - - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - goto get_parent; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - get_parent: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, - &move); - continue; - } - done = 1; - } while (!done); - - /* We are fine. */ - return 0; -} - -/* - * A series of AND or ORs where found together. Instead of - * climbing up and down the tree branches, an array of the - * ops were made in order of checks. We can just move across - * the array and short circuit if needed. - */ -static int process_ops(struct filter_pred *preds, - struct filter_pred *op, void *rec) -{ - struct filter_pred *pred; - int match = 0; - int type; - int i; - - /* - * Micro-optimization: We set type to true if op - * is an OR and false otherwise (AND). Then we - * just need to test if the match is equal to - * the type, and if it is, we can short circuit the - * rest of the checks: - * - * if ((match && op->op == OP_OR) || - * (!match && op->op == OP_AND)) - * return match; - */ - type = op->op == OP_OR; - - for (i = 0; i < op->val; i++) { - pred = &preds[op->ops[i]]; - if (!WARN_ON_ONCE(!pred->fn)) - match = pred->fn(pred, rec); - if (!!match == type) - break; - } - /* If not of not match is equal to not of not, then it is a match */ - return !!match == !op->not; -} - -struct filter_match_preds_data { - struct filter_pred *preds; - int match; - void *rec; -}; - -static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_match_preds_data *d = data; - - *err = 0; - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) - return WALK_PRED_DEFAULT; - /* We can treat folded ops as a leaf node */ - d->match = process_ops(d->preds, pred, d->rec); - } else { - if (!WARN_ON_ONCE(!pred->fn)) - d->match = pred->fn(pred, d->rec); - } - - return WALK_PRED_PARENT; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!d->match == (pred->op == OP_OR)) - return WALK_PRED_PARENT; - break; - case MOVE_UP_FROM_RIGHT: - break; - } - - return WALK_PRED_DEFAULT; } /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - struct filter_pred *preds; - struct filter_pred *root; - struct filter_match_preds_data data = { - /* match is currently meaningless */ - .match = -1, - .rec = rec, - }; - int n_preds, ret; + struct prog_entry *prog; + int i; /* no filter is considered a match */ if (!filter) return 1; - n_preds = filter->n_preds; - if (!n_preds) + prog = rcu_dereference_sched(filter->prog); + if (!prog) return 1; - /* - * n_preds, root and filter->preds are protect with preemption disabled. - */ - root = rcu_dereference_sched(filter->root); - if (!root) - return 1; - - data.preds = preds = rcu_dereference_sched(filter->preds); - ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); - WARN_ON(ret); - return data.match; + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + int match = pred->fn(pred, rec); + if (match == prog[i].when_to_branch) + i = prog[i].target; + } + return prog[i].target; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void parse_error(struct filter_parse_state *ps, int err, int pos) -{ - ps->lasterr = err; - ps->lasterr_pos = pos; -} - static void remove_filter_string(struct event_filter *filter) { if (!filter) @@ -664,57 +910,44 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static int replace_filter_string(struct event_filter *filter, - char *filter_string) -{ - kfree(filter->filter_string); - filter->filter_string = kstrdup(filter_string, GFP_KERNEL); - if (!filter->filter_string) - return -ENOMEM; - - return 0; -} - -static int append_filter_string(struct event_filter *filter, - char *string) -{ - int newlen; - char *new_filter_string; - - BUG_ON(!filter->filter_string); - newlen = strlen(filter->filter_string) + strlen(string) + 1; - new_filter_string = kmalloc(newlen, GFP_KERNEL); - if (!new_filter_string) - return -ENOMEM; - - strcpy(new_filter_string, filter->filter_string); - strcat(new_filter_string, string); - kfree(filter->filter_string); - filter->filter_string = new_filter_string; - - return 0; -} - -static void append_filter_err(struct filter_parse_state *ps, +static void append_filter_err(struct filter_parse_error *pe, struct event_filter *filter) { - int pos = ps->lasterr_pos; - char *buf, *pbuf; + struct trace_seq *s; + int pos = pe->lasterr_pos; + char *buf; + int len; - buf = (char *)__get_free_page(GFP_KERNEL); - if (!buf) + if (WARN_ON(!filter->filter_string)) return; - append_filter_string(filter, "\n"); - memset(buf, ' ', PAGE_SIZE); - if (pos > PAGE_SIZE - 128) - pos = 0; - buf[pos] = '^'; - pbuf = &buf[pos] + 1; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return; + trace_seq_init(s); + + len = strlen(filter->filter_string); + if (pos > len) + pos = len; + + /* indexing is off by one */ + if (pos) + pos++; - sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); - append_filter_string(filter, buf); - free_page((unsigned long) buf); + trace_seq_puts(s, filter->filter_string); + if (pe->lasterr > 0) { + trace_seq_printf(s, "\n%*s", pos, "^"); + trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + } else { + trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + } + trace_seq_putc(s, 0); + buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; + } + kfree(s); } static inline struct event_filter *event_filter(struct trace_event_file *file) @@ -747,108 +980,18 @@ void print_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); } -static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) -{ - stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); - if (!stack->preds) - return -ENOMEM; - stack->index = n_preds; - return 0; -} - -static void __free_pred_stack(struct pred_stack *stack) -{ - kfree(stack->preds); - stack->index = 0; -} - -static int __push_pred_stack(struct pred_stack *stack, - struct filter_pred *pred) -{ - int index = stack->index; - - if (WARN_ON(index == 0)) - return -ENOSPC; - - stack->preds[--index] = pred; - stack->index = index; - return 0; -} - -static struct filter_pred * -__pop_pred_stack(struct pred_stack *stack) -{ - struct filter_pred *pred; - int index = stack->index; - - pred = stack->preds[index++]; - if (!pred) - return NULL; - - stack->index = index; - return pred; -} - -static int filter_set_pred(struct event_filter *filter, - int idx, - struct pred_stack *stack, - struct filter_pred *src) -{ - struct filter_pred *dest = &filter->preds[idx]; - struct filter_pred *left; - struct filter_pred *right; - - *dest = *src; - dest->index = idx; - - if (dest->op == OP_OR || dest->op == OP_AND) { - right = __pop_pred_stack(stack); - left = __pop_pred_stack(stack); - if (!left || !right) - return -EINVAL; - /* - * If both children can be folded - * and they are the same op as this op or a leaf, - * then this op can be folded. - */ - if (left->index & FILTER_PRED_FOLD && - ((left->op == dest->op && !left->not) || - left->left == FILTER_PRED_INVALID) && - right->index & FILTER_PRED_FOLD && - ((right->op == dest->op && !right->not) || - right->left == FILTER_PRED_INVALID)) - dest->index |= FILTER_PRED_FOLD; - - dest->left = left->index & ~FILTER_PRED_FOLD; - dest->right = right->index & ~FILTER_PRED_FOLD; - left->parent = dest->index & ~FILTER_PRED_FOLD; - right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else { - /* - * Make dest->left invalid to be used as a quick - * way to know this is a leaf node. - */ - dest->left = FILTER_PRED_INVALID; - - /* All leafs allow folding the parent ops. */ - dest->index |= FILTER_PRED_FOLD; - } - - return __push_pred_stack(stack, dest); -} - -static void __free_preds(struct event_filter *filter) +static void free_prog(struct event_filter *filter) { + struct prog_entry *prog; int i; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - kfree(filter->preds[i].ops); - kfree(filter->preds); - filter->preds = NULL; - } - filter->a_preds = 0; - filter->n_preds = 0; + prog = rcu_access_pointer(filter->prog); + if (!prog) + return; + + for (i = 0; prog[i].pred; i++) + kfree(prog[i].pred); + kfree(prog); } static void filter_disable(struct trace_event_file *file) @@ -866,7 +1009,7 @@ static void __free_filter(struct event_filter *filter) if (!filter) return; - __free_preds(filter); + free_prog(filter); kfree(filter->filter_string); kfree(filter); } @@ -876,38 +1019,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -static struct event_filter *__alloc_filter(void) -{ - struct event_filter *filter; - - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - return filter; -} - -static int __alloc_preds(struct event_filter *filter, int n_preds) -{ - struct filter_pred *pred; - int i; - - if (filter->preds) - __free_preds(filter); - - filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - - filter->a_preds = n_preds; - filter->n_preds = 0; - - for (i = 0; i < n_preds; i++) { - pred = &filter->preds[i]; - pred->fn = filter_pred_none; - } - - return 0; -} - static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); @@ -944,27 +1055,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, } } -static int filter_add_pred(struct filter_parse_state *ps, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack) -{ - int err; - - if (WARN_ON(filter->n_preds == filter->a_preds)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = filter_set_pred(filter, filter->n_preds, stack, pred); - if (err) - return err; - - filter->n_preds++; - - return 0; -} - int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) @@ -976,761 +1066,449 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) -{ - if (is_string_field(field) && - (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return false; - if (!is_string_field(field) && op == OP_GLOB) - return false; - - return true; -} - static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; + int pred_func_index = -1; + + switch (op) { + case OP_EQ: + case OP_NE: + break; + default: + if (WARN_ON_ONCE(op < PRED_FUNC_START)) + return NULL; + pred_func_index = op - PRED_FUNC_START; + if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) + return NULL; + } switch (field_size) { case 8: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_64; else if (field_is_signed) - fn = pred_funcs_s64[op - PRED_FUNC_START]; + fn = pred_funcs_s64[pred_func_index]; else - fn = pred_funcs_u64[op - PRED_FUNC_START]; + fn = pred_funcs_u64[pred_func_index]; break; case 4: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_32; else if (field_is_signed) - fn = pred_funcs_s32[op - PRED_FUNC_START]; + fn = pred_funcs_s32[pred_func_index]; else - fn = pred_funcs_u32[op - PRED_FUNC_START]; + fn = pred_funcs_u32[pred_func_index]; break; case 2: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_16; else if (field_is_signed) - fn = pred_funcs_s16[op - PRED_FUNC_START]; + fn = pred_funcs_s16[pred_func_index]; else - fn = pred_funcs_u16[op - PRED_FUNC_START]; + fn = pred_funcs_u16[pred_func_index]; break; case 1: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_8; else if (field_is_signed) - fn = pred_funcs_s8[op - PRED_FUNC_START]; + fn = pred_funcs_s8[pred_func_index]; else - fn = pred_funcs_u8[op - PRED_FUNC_START]; + fn = pred_funcs_u8[pred_func_index]; break; } return fn; } -static int init_pred(struct filter_parse_state *ps, - struct ftrace_event_field *field, - struct filter_pred *pred) - +/* Called when a predicate is encountered by predicate_parse() */ +static int parse_pred(const char *str, void *data, + int pos, struct filter_parse_error *pe, + struct filter_pred **pred_ptr) { - filter_pred_fn_t fn = filter_pred_none; - unsigned long long val; + struct trace_event_call *call = data; + struct ftrace_event_field *field; + struct filter_pred *pred = NULL; + char num_buf[24]; /* Big enough to hold an address */ + char *field_name; + char q; + u64 val; + int len; int ret; + int op; + int s; + int i = 0; - pred->offset = field->offset; - - if (!is_legal_op(field, pred->op)) { - parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); - return -EINVAL; - } - - if (field->filter_type == FILTER_COMM) { - filter_build_regex(pred); - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (is_string_field(field)) { - filter_build_regex(pred); - - if (field->filter_type == FILTER_STATIC_STRING) { - fn = filter_pred_string; - pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; - else - fn = filter_pred_pchar; - } else if (is_function_field(field)) { - if (strcmp(field->name, "ip")) { - parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); - return -EINVAL; - } - } else { - if (field->is_signed) - ret = kstrtoll(pred->regex.pattern, 0, &val); - else - ret = kstrtoull(pred->regex.pattern, 0, &val); - if (ret) { - parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); - return -EINVAL; - } - pred->val = val; - - if (field->filter_type == FILTER_CPU) - fn = filter_pred_cpu; - else - fn = select_comparison_fn(pred->op, field->size, - field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - } - - if (pred->op == OP_NE) - pred->not ^= 1; - - pred->fn = fn; - return 0; -} - -static void parse_init(struct filter_parse_state *ps, - struct filter_op *ops, - char *infix_string) -{ - memset(ps, '\0', sizeof(*ps)); - - ps->infix.string = infix_string; - ps->infix.cnt = strlen(infix_string); - ps->ops = ops; - - INIT_LIST_HEAD(&ps->opstack); - INIT_LIST_HEAD(&ps->postfix); -} - -static char infix_next(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return 0; - - ps->infix.cnt--; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; - return ps->infix.string[ps->infix.tail++]; -} - -static char infix_peek(struct filter_parse_state *ps) -{ - if (ps->infix.tail == strlen(ps->infix.string)) - return 0; - - return ps->infix.string[ps->infix.tail]; -} + while (isalnum(str[i]) || str[i] == '_') + i++; -static void infix_advance(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return; + len = i - s; - ps->infix.cnt--; - ps->infix.tail++; -} + if (!len) + return -1; -static inline int is_precedence_lower(struct filter_parse_state *ps, - int a, int b) -{ - return ps->ops[a].precedence < ps->ops[b].precedence; -} + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; -static inline int is_op_char(struct filter_parse_state *ps, char c) -{ - int i; + /* Make sure that the field exists */ - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (ps->ops[i].string[0] == c) - return 1; + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) { + parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i); + return -EINVAL; } - return 0; -} - -static int infix_get_op(struct filter_parse_state *ps, char firstc) -{ - char nextc = infix_peek(ps); - char opstr[3]; - int i; - - opstr[0] = firstc; - opstr[1] = nextc; - opstr[2] = '\0'; + while (isspace(str[i])) + i++; - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) { - infix_advance(ps); - return ps->ops[i].id; - } + /* Make sure this op is supported */ + for (op = 0; ops[op]; op++) { + /* This is why '<=' must come before '<' in ops[] */ + if (strncmp(str + i, ops[op], strlen(ops[op])) == 0) + break; } - opstr[1] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) - return ps->ops[i].id; + if (!ops[op]) { + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; } - return OP_NONE; -} - -static inline void clear_operand_string(struct filter_parse_state *ps) -{ - memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); - ps->operand.tail = 0; -} - -static inline int append_operand_char(struct filter_parse_state *ps, char c) -{ - if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) - return -EINVAL; + i += strlen(ops[op]); - ps->operand.string[ps->operand.tail++] = c; + while (isspace(str[i])) + i++; - return 0; -} + s = i; -static int filter_opstack_push(struct filter_parse_state *ps, - enum filter_op_ids op) -{ - struct opstack_op *opstack_op; - - opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); - if (!opstack_op) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) return -ENOMEM; - opstack_op->op = op; - list_add(&opstack_op->list, &ps->opstack); - - return 0; -} + pred->field = field; + pred->offset = field->offset; + pred->op = op; -static int filter_opstack_empty(struct filter_parse_state *ps) -{ - return list_empty(&ps->opstack); -} + if (ftrace_event_is_function(call)) { + /* + * Perf does things different with function events. + * It only allows an "ip" field, and expects a string. + * But the string does not need to be surrounded by quotes. + * If it is a string, the assigned function as a nop, + * (perf doesn't use it) and grab everything. + */ + if (strcmp(field->name, "ip") != 0) { + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } -static int filter_opstack_top(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + pred->regex.pattern[len] = 0; + + /* This is either a string, or an integer */ + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the op is OK for strings */ + switch (op) { + case OP_NE: + pred->not = 1; + /* Fall through */ + case OP_GLOB: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } - if (filter_opstack_empty(ps)) - return OP_NONE; + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i); + goto err_free; + } - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + for (i++; str[i]; i++) { + if (str[i] == q) + break; + } + if (!str[i]) { + parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i); + goto err_free; + } - return opstack_op->op; -} + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } -static int filter_opstack_pop(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - enum filter_op_ids op; + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + pred->regex.pattern[len] = 0; - if (filter_opstack_empty(ps)) - return OP_NONE; + filter_build_regex(pred); - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - op = opstack_op->op; - list_del(&opstack_op->list); + if (field->filter_type == FILTER_COMM) { + pred->fn = filter_pred_comm; - kfree(opstack_op); + } else if (field->filter_type == FILTER_STATIC_STRING) { + pred->fn = filter_pred_string; + pred->regex.field_len = field->size; - return op; -} + } else if (field->filter_type == FILTER_DYN_STRING) + pred->fn = filter_pred_strloc; + else + pred->fn = filter_pred_pchar; + /* go past the last quote */ + i++; -static void filter_opstack_clear(struct filter_parse_state *ps) -{ - while (!filter_opstack_empty(ps)) - filter_opstack_pop(ps); -} + } else if (isdigit(str[i])) { -static char *curr_operand(struct filter_parse_state *ps) -{ - return ps->operand.string; -} + /* Make sure the field is not a string */ + if (is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i); + goto err_free; + } -static int postfix_append_operand(struct filter_parse_state *ps, char *operand) -{ - struct postfix_elt *elt; + if (op == OP_GLOB) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; - elt->op = OP_NONE; - elt->operand = kstrdup(operand, GFP_KERNEL); - if (!elt->operand) { - kfree(elt); - return -ENOMEM; - } + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } - list_add_tail(&elt->list, &ps->postfix); + strncpy(num_buf, str + s, len); + num_buf[len] = 0; - return 0; -} + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num_buf, 0, &val); + else + ret = kstrtoull(num_buf, 0, &val); + if (ret) { + parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s); + goto err_free; + } -static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) -{ - struct postfix_elt *elt; + pred->val = val; - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; + if (field->filter_type == FILTER_CPU) + pred->fn = filter_pred_cpu; + else { + pred->fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (pred->op == OP_NE) + pred->not = 1; + } - elt->op = op; - elt->operand = NULL; + } else { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } - list_add_tail(&elt->list, &ps->postfix); + *pred_ptr = pred; + return i; - return 0; +err_free: + kfree(pred); + return -EINVAL; } -static void postfix_clear(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; +enum { + TOO_MANY_CLOSE = -1, + TOO_MANY_OPEN = -2, + MISSING_QUOTE = -3, +}; - while (!list_empty(&ps->postfix)) { - elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - list_del(&elt->list); - kfree(elt->operand); - kfree(elt); - } -} +/* + * Read the filter string once to calculate the number of predicates + * as well as how deep the parentheses go. + * + * Returns: + * 0 - everything is fine (err is undefined) + * -1 - too many ')' + * -2 - too many '(' + * -3 - No matching quote + */ +static int calc_stack(const char *str, int *parens, int *preds, int *err) +{ + bool is_pred = false; + int nr_preds = 0; + int open = 1; /* Count the expression as "(E)" */ + int last_quote = 0; + int max_open = 1; + int quote = 0; + int i; -static int filter_parse(struct filter_parse_state *ps) -{ - enum filter_op_ids op, top_op; - int in_string = 0; - char ch; + *err = 0; - while ((ch = infix_next(ps))) { - if (ch == '"') { - in_string ^= 1; + for (i = 0; str[i]; i++) { + if (isspace(str[i])) continue; - } - - if (in_string) - goto parse_operand; - - if (isspace(ch)) + if (quote) { + if (str[i] == quote) + quote = 0; continue; + } - if (is_op_char(ps, ch)) { - op = infix_get_op(ps, ch); - if (op == OP_NONE) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_top(ps); - if (!is_precedence_lower(ps, top_op, op)) { - top_op = filter_opstack_pop(ps); - postfix_append_op(ps, top_op); - continue; - } + switch (str[i]) { + case '\'': + case '"': + quote = str[i]; + last_quote = i; + break; + case '|': + case '&': + if (str[i+1] != str[i]) break; - } - - filter_opstack_push(ps, op); + is_pred = false; continue; - } - - if (ch == '(') { - filter_opstack_push(ps, OP_OPEN_PAREN); + case '(': + is_pred = false; + open++; + if (open > max_open) + max_open = open; continue; - } - - if (ch == ')') { - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - top_op = filter_opstack_pop(ps); - while (top_op != OP_NONE) { - if (top_op == OP_OPEN_PAREN) - break; - postfix_append_op(ps, top_op); - top_op = filter_opstack_pop(ps); - } - if (top_op == OP_NONE) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; + case ')': + is_pred = false; + if (open == 1) { + *err = i; + return TOO_MANY_CLOSE; } + open--; continue; } -parse_operand: - if (append_operand_char(ps, ch)) { - parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); - return -EINVAL; - } - } - - if (strlen(curr_operand(ps))) - postfix_append_operand(ps, curr_operand(ps)); - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_pop(ps); - if (top_op == OP_NONE) - break; - if (top_op == OP_OPEN_PAREN) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; + if (!is_pred) { + nr_preds++; + is_pred = true; } - postfix_append_op(ps, top_op); } - return 0; -} - -static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct trace_event_call *call, - enum filter_op_ids op, - char *operand1, char *operand2) -{ - struct ftrace_event_field *field; - static struct filter_pred pred; - - memset(&pred, 0, sizeof(pred)); - pred.op = op; - - if (op == OP_AND || op == OP_OR) - return &pred; - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return NULL; - } - - field = trace_find_event_field(call, operand1); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return NULL; + if (quote) { + *err = last_quote; + return MISSING_QUOTE; } - strcpy(pred.regex.pattern, operand2); - pred.regex.len = strlen(pred.regex.pattern); - pred.field = field; - return init_pred(ps, field, &pred) ? NULL : &pred; -} - -static int check_preds(struct filter_parse_state *ps) -{ - int n_normal_preds = 0, n_logical_preds = 0; - struct postfix_elt *elt; - int cnt = 0; + if (open != 1) { + int level = open; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - cnt++; - continue; - } - - if (elt->op == OP_AND || elt->op == OP_OR) { - n_logical_preds++; - cnt--; - continue; + /* find the bad open */ + for (i--; i; i--) { + if (quote) { + if (str[i] == quote) + quote = 0; + continue; + } + switch (str[i]) { + case '(': + if (level == open) { + *err = i; + return TOO_MANY_OPEN; + } + level--; + break; + case ')': + level++; + break; + case '\'': + case '"': + quote = str[i]; + break; + } } - if (elt->op != OP_NOT) - cnt--; - n_normal_preds++; - /* all ops should have operands */ - if (cnt < 0) - break; - } - - if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) { - parse_error(ps, FILT_ERR_INVALID_FILTER, 0); - return -EINVAL; + /* First character is the '(' with missing ')' */ + *err = 0; + return TOO_MANY_OPEN; } + /* Set the size of the required stacks */ + *parens = max_open; + *preds = nr_preds; return 0; } -static int count_preds(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - int n_preds = 0; - - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) - continue; - n_preds++; - } - - return n_preds; -} - -struct check_pred_data { - int count; - int max; -}; - -static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct check_pred_data *d = data; - - if (WARN_ON(d->count++ > d->max)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - return WALK_PRED_DEFAULT; -} - -/* - * The tree is walked at filtering of an event. If the tree is not correctly - * built, it may cause an infinite loop. Check here that the tree does - * indeed terminate. - */ -static int check_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - struct check_pred_data data = { - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - .max = 3 * filter->n_preds, - .count = 0, - }; - - return walk_pred_tree(filter->preds, root, - check_pred_tree_cb, &data); -} - -static int count_leafs_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - int *count = data; - - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) - (*count)++; - - return WALK_PRED_DEFAULT; -} - -static int count_leafs(struct filter_pred *preds, struct filter_pred *root) -{ - int count = 0, ret; - - ret = walk_pred_tree(preds, root, count_leafs_cb, &count); - WARN_ON(ret); - return count; -} - -struct fold_pred_data { - struct filter_pred *root; - int count; - int children; -}; - -static int fold_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct fold_pred_data *d = data; - struct filter_pred *root = d->root; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (pred->left != FILTER_PRED_INVALID) - return WALK_PRED_DEFAULT; - - if (WARN_ON(d->count == d->children)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - - pred->index &= ~FILTER_PRED_FOLD; - root->ops[d->count++] = pred->index; - return WALK_PRED_DEFAULT; -} - -static int fold_pred(struct filter_pred *preds, struct filter_pred *root) -{ - struct fold_pred_data data = { - .root = root, - .count = 0, - }; - int children; - - /* No need to keep the fold flag */ - root->index &= ~FILTER_PRED_FOLD; - - /* If the root is a leaf then do nothing */ - if (root->left == FILTER_PRED_INVALID) - return 0; - - /* count the children */ - children = count_leafs(preds, &preds[root->left]); - children += count_leafs(preds, &preds[root->right]); - - root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); - if (!root->ops) - return -ENOMEM; - - root->val = children; - data.children = children; - return walk_pred_tree(preds, root, fold_pred_cb, &data); -} - -static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_pred *preds = data; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (!(pred->index & FILTER_PRED_FOLD)) - return WALK_PRED_DEFAULT; - - *err = fold_pred(preds, pred); - if (*err) - return WALK_PRED_ABORT; - - /* eveyrhing below is folded, continue with parent */ - return WALK_PRED_PARENT; -} - -/* - * To optimize the processing of the ops, if we have several "ors" or - * "ands" together, we can put them in an array and process them all - * together speeding up the filter logic. - */ -static int fold_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, - filter->preds); -} - -static int replace_preds(struct trace_event_call *call, +static int process_preds(struct trace_event_call *call, + const char *filter_string, struct event_filter *filter, - struct filter_parse_state *ps, - bool dry_run) + struct filter_parse_error *pe) { - char *operand1 = NULL, *operand2 = NULL; - struct filter_pred *pred; - struct filter_pred *root; - struct postfix_elt *elt; - struct pred_stack stack = { }; /* init to NULL */ - int err; - int n_preds = 0; - - n_preds = count_preds(ps); - if (n_preds >= MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = check_preds(ps); - if (err) - return err; - - if (!dry_run) { - err = __alloc_pred_stack(&stack, n_preds); - if (err) - return err; - err = __alloc_preds(filter, n_preds); - if (err) - goto fail; - } - - n_preds = 0; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - if (!operand1) - operand1 = elt->operand; - else if (!operand2) - operand2 = elt->operand; - else { - parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - err = -EINVAL; - goto fail; - } - continue; - } - - if (elt->op == OP_NOT) { - if (!n_preds || operand1 || operand2) { - parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); - err = -EINVAL; - goto fail; - } - if (!dry_run) - filter->preds[n_preds - 1].not ^= 1; - continue; - } - - if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - err = -ENOSPC; - goto fail; - } - - pred = create_pred(ps, call, elt->op, operand1, operand2); - if (!pred) { - err = -EINVAL; - goto fail; - } + struct prog_entry *prog; + int nr_parens; + int nr_preds; + int index; + int ret; - if (!dry_run) { - err = filter_add_pred(ps, filter, pred, &stack); - if (err) - goto fail; + ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index); + if (ret < 0) { + switch (ret) { + case MISSING_QUOTE: + parse_error(pe, FILT_ERR_MISSING_QUOTE, index); + break; + case TOO_MANY_OPEN: + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index); + break; + default: + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index); } - - operand1 = operand2 = NULL; + return ret; } - if (!dry_run) { - /* We should have one item left on the stack */ - pred = __pop_pred_stack(&stack); - if (!pred) - return -EINVAL; - /* This item is where we start from in matching */ - root = pred; - /* Make sure the stack is empty */ - pred = __pop_pred_stack(&stack); - if (WARN_ON(pred)) { - err = -EINVAL; - filter->root = NULL; - goto fail; - } - err = check_pred_tree(filter, root); - if (err) - goto fail; - - /* Optimize the tree */ - err = fold_pred_tree(filter, root); - if (err) - goto fail; - - /* We don't set root until we know it works */ - barrier(); - filter->root = root; + if (!nr_preds) { + prog = NULL; + } else { + prog = predicate_parse(filter_string, nr_parens, nr_preds, + parse_pred, call, pe); + if (IS_ERR(prog)) + return PTR_ERR(prog); } - - err = 0; -fail: - __free_pred_stack(&stack); - return err; + rcu_assign_pointer(filter->prog, prog); + return 0; } static inline void event_set_filtered_flag(struct trace_event_file *file) @@ -1780,72 +1558,53 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct trace_subsystem_dir *dir, +static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, - struct filter_parse_state *ps, + struct filter_parse_error *pe, char *filter_string) { struct trace_event_file *file; struct filter_list *filter_item; + struct event_filter *filter = NULL; struct filter_list *tmp; LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(file, &tr->events, list) { - if (file->system != dir) - continue; - - /* - * Try to see if the filter can be applied - * (filter arg is ignored on dry_run) - */ - err = replace_preds(file->event_call, NULL, ps, true); - if (err) - event_set_no_set_filter_flag(file); - else - event_clear_no_set_filter_flag(file); - } - - list_for_each_entry(file, &tr->events, list) { - struct event_filter *filter; if (file->system != dir) continue; - if (event_no_set_filter_flag(file)) - continue; - - filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); - if (!filter_item) - goto fail_mem; - - list_add_tail(&filter_item->list, &filter_list); - - filter_item->filter = __alloc_filter(); - if (!filter_item->filter) + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) goto fail_mem; - filter = filter_item->filter; - /* Can only fail on no memory */ - err = replace_filter_string(filter, filter_string); - if (err) + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) goto fail_mem; - err = replace_preds(file->event_call, filter, ps, false); + err = process_preds(file->event_call, filter_string, filter, pe); if (err) { filter_disable(file); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(ps, filter); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(pe, filter); } else event_set_filtered_flag(file); + + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = event_filter(file); - event_set_filter(file, filter_item->filter); - filter_item->filter = filter; + filter_item->filter = event_filter(file); + event_set_filter(file, filter); + filter = NULL; fail = false; } @@ -1871,9 +1630,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, list_del(&filter_item->list); kfree(filter_item); } - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: + kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) synchronize_sched(); @@ -1885,47 +1645,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, return -ENOMEM; } -static int create_filter_start(char *filter_str, bool set_str, - struct filter_parse_state **psp, +static int create_filter_start(char *filter_string, bool set_str, + struct filter_parse_error **pse, struct event_filter **filterp) { struct event_filter *filter; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err = 0; - WARN_ON_ONCE(*psp || *filterp); + if (WARN_ON_ONCE(*pse || *filterp)) + return -EINVAL; - /* allocate everything, and if any fails, free all and fail */ - filter = __alloc_filter(); - if (filter && set_str) - err = replace_filter_string(filter, filter_str); + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (filter && set_str) { + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + err = -ENOMEM; + } - ps = kzalloc(sizeof(*ps), GFP_KERNEL); + pe = kzalloc(sizeof(*pe), GFP_KERNEL); - if (!filter || !ps || err) { - kfree(ps); + if (!filter || !pe || err) { + kfree(pe); __free_filter(filter); return -ENOMEM; } /* we're committed to creating a new filter */ *filterp = filter; - *psp = ps; + *pse = pe; - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err && set_str) - append_filter_err(ps, filter); - return err; + return 0; } -static void create_filter_finish(struct filter_parse_state *ps) +static void create_filter_finish(struct filter_parse_error *pe) { - if (ps) { - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - } + kfree(pe); } /** @@ -1945,26 +1700,20 @@ static void create_filter_finish(struct filter_parse_state *ps) * freeing it. */ static int create_filter(struct trace_event_call *call, - char *filter_str, bool set_str, + char *filter_string, bool set_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, set_str, &ps, &filter); - if (!err) { - err = replace_preds(call, filter, ps, false); - if (err && set_str) - append_filter_err(ps, filter); - } - if (err && !set_str) { - free_event_filter(filter); - filter = NULL; - } - create_filter_finish(ps); + err = create_filter_start(filter_string, set_str, &pe, filterp); + if (err) + return err; + + err = process_preds(call, filter_string, *filterp, pe); + if (err && set_str) + append_filter_err(pe, *filterp); - *filterp = filter; return err; } @@ -1988,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &ps, &filter); + err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = replace_system_preds(dir, tr, ps, filter_str); + err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; } else { - append_filter_err(ps, filter); + append_filter_err(pe, *filterp); } } - create_filter_finish(ps); + create_filter_finish(pe); - *filterp = filter; return err; } @@ -2013,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; - struct event_filter *filter; + struct event_filter *filter = NULL; int err; if (!strcmp(strstrip(filter_string), "0")) { @@ -2066,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; - struct event_filter *filter; + struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); @@ -2186,66 +1933,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, return ret; } -static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +static int ftrace_function_check_pred(struct filter_pred *pred) { struct ftrace_event_field *field = pred->field; - if (leaf) { - /* - * Check the leaf predicate for function trace, verify: - * - only '==' and '!=' is used - * - the 'ip' field is used - */ - if ((pred->op != OP_EQ) && (pred->op != OP_NE)) - return -EINVAL; + /* + * Check the predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; - if (strcmp(field->name, "ip")) - return -EINVAL; - } else { - /* - * Check the non leaf predicate for function trace, verify: - * - only '||' is used - */ - if (pred->op != OP_OR) - return -EINVAL; - } + if (strcmp(field->name, "ip")) + return -EINVAL; return 0; } -static int ftrace_function_set_filter_cb(enum move_type move, - struct filter_pred *pred, - int *err, void *data) +static int ftrace_function_set_filter_pred(struct filter_pred *pred, + struct function_filter_data *data) { + int ret; + /* Checking the node is valid for function trace. */ - if ((move != MOVE_DOWN) || - (pred->left != FILTER_PRED_INVALID)) { - *err = ftrace_function_check_pred(pred, 0); - } else { - *err = ftrace_function_check_pred(pred, 1); - if (*err) - return WALK_PRED_ABORT; - - *err = __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - pred->regex.len, - data); - } + ret = ftrace_function_check_pred(pred); + if (ret) + return ret; - return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; + return __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); +} + +static bool is_or(struct prog_entry *prog, int i) +{ + int target; + + /* + * Only "||" is allowed for function events, thus, + * all true branches should jump to true, and any + * false branch should jump to false. + */ + target = prog[i].target + 1; + /* True and false have NULL preds (all prog entries should jump to one */ + if (prog[target].pred) + return false; + + /* prog[target].target is 1 for TRUE, 0 for FALSE */ + return prog[i].when_to_branch == prog[target].target; } static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); struct function_filter_data data = { .first_filter = 1, .first_notrace = 1, .ops = &event->ftrace_ops, }; + int i; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; - return walk_pred_tree(filter->preds, filter->root, - ftrace_function_set_filter_cb, &data); + if (!is_or(prog, i)) + return -EINVAL; + + if (ftrace_function_set_filter_pred(pred, &data) < 0) + return -EINVAL; + } + return 0; } #else static int ftrace_function_set_filter(struct perf_event *event, @@ -2259,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; - struct event_filter *filter; + struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex); @@ -2375,7 +2136,7 @@ static struct test_filter_data_t { #undef YES #undef NO -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) +#define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; @@ -2388,26 +2149,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event) return 1; } -static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) +static void update_pred_fn(struct event_filter *filter, char *fields) { - char *fields = data; + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); + int i; - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) { + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; + WARN_ON_ONCE(!pred->fn); + if (!field) { - WARN(1, "all leafs should have field defined"); - return WALK_PRED_DEFAULT; + WARN_ONCE(1, "all leafs should have field defined %d", i); + continue; } + if (!strchr(fields, *field->name)) - return WALK_PRED_DEFAULT; + continue; - WARN_ON(!pred->fn); pred->fn = test_pred_visited_fn; } - return WALK_PRED_DEFAULT; } static __init int ftrace_test_event_filter(void) @@ -2431,20 +2194,22 @@ static __init int ftrace_test_event_filter(void) break; } + /* Needed to dereference filter->prog */ + mutex_lock(&event_mutex); /* * The preemption disabling is not really needed for self * tests, but the rcu dereference will complain without it. */ preempt_disable(); if (*d->not_visited) - walk_pred_tree(filter->preds, filter->root, - test_walk_pred_cb, - d->not_visited); + update_pred_fn(filter, d->not_visited); test_pred_visited = 0; err = filter_match_preds(filter, &d->rec); preempt_enable(); + mutex_unlock(&event_mutex); + __free_filter(filter); if (test_pred_visited) { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1e1558c99d56..0d7b3ffbecc2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -20,15 +20,39 @@ #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/rculist.h> +#include <linux/tracefs.h> #include "tracing_map.h" #include "trace.h" +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 16 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + struct hist_field; -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); +typedef u64 (*hist_field_fn_t) (struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event); #define HIST_FIELD_OPERANDS_MAX 2 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) +#define HIST_ACTIONS_MAX 8 + +enum field_op_id { + FIELD_OP_NONE, + FIELD_OP_PLUS, + FIELD_OP_MINUS, + FIELD_OP_UNARY_MINUS, +}; + +struct hist_var { + char *name; + struct hist_trigger_data *hist_data; + unsigned int idx; +}; struct hist_field { struct ftrace_event_field *field; @@ -37,27 +61,49 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; + struct hist_trigger_data *hist_data; + struct hist_var var; + enum field_op_id operator; + char *system; + char *event_name; + char *name; + unsigned int var_idx; + unsigned int var_ref_idx; + bool read_once; }; -static u64 hist_field_none(struct hist_field *field, void *event) +static u64 hist_field_none(struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { return 0; } -static u64 hist_field_counter(struct hist_field *field, void *event) +static u64 hist_field_counter(struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { return 1; } -static u64 hist_field_string(struct hist_field *hist_field, void *event) +static u64 hist_field_string(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { char *addr = (char *)(event + hist_field->field->offset); return (u64)(unsigned long)addr; } -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +static u64 hist_field_dynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { u32 str_item = *(u32 *)(event + hist_field->field->offset); int str_loc = str_item & 0xffff; @@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) return (u64)(unsigned long)addr; } -static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +static u64 hist_field_pstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { char **addr = (char **)(event + hist_field->field->offset); return (u64)(unsigned long)*addr; } -static u64 hist_field_log2(struct hist_field *hist_field, void *event) +static u64 hist_field_log2(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, event); + u64 val = operand->fn(operand, elt, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_plus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); + + return val1 + val2; +} + +static u64 hist_field_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); + + return val1 - val2; +} + +static u64 hist_field_unary_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + + s64 sval = (s64)operand->fn(operand, elt, rbe, event); + u64 val = (u64)-sval; + + return val; +} + #define DEFINE_HIST_FIELD_FN(type) \ -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ + static u64 hist_field_##type(struct hist_field *hist_field, \ + struct tracing_map_elt *elt, \ + struct ring_buffer_event *rbe, \ + void *event) \ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ @@ -126,6 +222,19 @@ enum hist_field_flags { HIST_FIELD_FL_SYSCALL = 1 << 7, HIST_FIELD_FL_STACKTRACE = 1 << 8, HIST_FIELD_FL_LOG2 = 1 << 9, + HIST_FIELD_FL_TIMESTAMP = 1 << 10, + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, + HIST_FIELD_FL_VAR = 1 << 12, + HIST_FIELD_FL_EXPR = 1 << 13, + HIST_FIELD_FL_VAR_REF = 1 << 14, + HIST_FIELD_FL_CPU = 1 << 15, + HIST_FIELD_FL_ALIAS = 1 << 16, +}; + +struct var_defs { + unsigned int n_vars; + char *name[TRACING_MAP_VARS_MAX]; + char *expr[TRACING_MAP_VARS_MAX]; }; struct hist_trigger_attrs { @@ -133,25 +242,1437 @@ struct hist_trigger_attrs { char *vals_str; char *sort_key_str; char *name; + char *clock; bool pause; bool cont; bool clear; + bool ts_in_usecs; unsigned int map_bits; + + char *assignment_str[TRACING_MAP_VARS_MAX]; + unsigned int n_assignments; + + char *action_str[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct var_defs var_defs; +}; + +struct field_var { + struct hist_field *var; + struct hist_field *val; +}; + +struct field_var_hist { + struct hist_trigger_data *hist_data; + char *cmd; }; struct hist_trigger_data { - struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + struct hist_field *fields[HIST_FIELDS_MAX]; unsigned int n_vals; unsigned int n_keys; unsigned int n_fields; + unsigned int n_vars; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; struct trace_event_file *event_file; struct hist_trigger_attrs *attrs; struct tracing_map *map; + bool enable_timestamps; + bool remove; + struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; + unsigned int n_var_refs; + + struct action_data *actions[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; + unsigned int n_synth_var_refs; + struct field_var *field_vars[SYNTH_FIELDS_MAX]; + unsigned int n_field_vars; + unsigned int n_field_var_str; + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; + unsigned int n_field_var_hists; + + struct field_var *max_vars[SYNTH_FIELDS_MAX]; + unsigned int n_max_vars; + unsigned int n_max_var_str; +}; + +struct synth_field { + char *type; + char *name; + size_t size; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct list_head list; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; +}; + +struct action_data; + +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals); + +struct action_data { + action_fn_t fn; + unsigned int n_params; + char *params[SYNTH_FIELDS_MAX]; + + union { + struct { + unsigned int var_ref_idx; + char *match_event; + char *match_event_system; + char *synth_event_name; + struct synth_event *synth_event; + } onmatch; + + struct { + char *var_str; + char *fn_name; + unsigned int max_var_ref_idx; + struct hist_field *max_var; + struct hist_field *var; + } onmax; + }; +}; + + +static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char hist_err_str[MAX_FILTER_STR_VAL]; + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void hist_err(char *str, char *var) +{ + int maxlen = MAX_FILTER_STR_VAL - 1; + + if (!str) + return; + + if (strlen(hist_err_str)) + return; + + if (!var) + var = ""; + + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) + return; + + strcat(hist_err_str, str); + strcat(hist_err_str, var); +} + +static void hist_err_event(char *str, char *system, char *event, char *var) +{ + char err[MAX_FILTER_STR_VAL]; + + if (system && var) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); + else if (system) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); + else + strncpy(err, var, MAX_FILTER_STR_VAL); + + hist_err(str, err); +} + +static void hist_err_clear(void) +{ + hist_err_str[0] = '\0'; +} + +static bool have_hist_err(void) +{ + if (strlen(hist_err_str)) + return true; + + return false; +} + +static LIST_HEAD(synth_event_list); +static DEFINE_MUTEX(synth_event_mutex); + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (strncmp(type, "u", 1) == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += strlen("char["); + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->trace_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(char *field_type, + char *field_name) +{ + struct synth_field *field; + int len, ret = 0; + char *array; + + if (field_type[0] == ';') + field_type++; + + len = strlen(field_name); + if (field_name[len - 1] == ';') + field_name[len - 1] = '\0'; + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_type) + 1; + array = strchr(field_name, '['); + if (array) + len += strlen(array); + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + *array = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + field->name = kstrdup(field_name, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int var_ref_idx); + +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct tracepoint *tp = event->tp; + + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; + + if (!(cpu_online(raw_smp_processor_id()))) + return; + + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } +} + +static struct synth_event *find_synth_event(const char *name) +{ + struct synth_event *event; + + list_for_each_entry(event, &synth_event_list, list) { + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->define_fields = synth_event_define_fields; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(char *event_name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(event_name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static void action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals) +{ + struct synth_event *event = data->onmatch.synth_event; + + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); +} + +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; +}; + +static void add_or_delete_synth_event(struct synth_event *event, int delete) +{ + if (delete) + free_synth_event(event); + else { + mutex_lock(&synth_event_mutex); + if (!find_synth_event(event->name)) + list_add(&event->list, &synth_event_list); + else + free_synth_event(event); + mutex_unlock(&synth_event_mutex); + } +} + +static int create_synth_event(int argc, char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + bool delete_event = false; + int i, n_fields = 0, ret = 0; + char *name; + + mutex_lock(&synth_event_mutex); + + /* + * Argument syntax: + * - Add synthetic event: <event_name> field[;field] ... + * - Remove synthetic event: !<event_name> field[;field] ... + * where 'field' = type field_name + */ + if (argc < 1) { + ret = -EINVAL; + goto out; + } + + name = argv[0]; + if (name[0] == '!') { + delete_event = true; + name++; + } + + event = find_synth_event(name); + if (event) { + if (delete_event) { + if (event->ref) { + event = NULL; + ret = -EBUSY; + goto out; + } + list_del(&event->list); + goto out; + } + event = NULL; + ret = -EEXIST; + goto out; + } else if (delete_event) + goto out; + + if (argc < 2) { + ret = -EINVAL; + goto out; + } + + for (i = 1; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argv[i], argv[i + 1]); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields] = field; + i++; n_fields++; + } + + if (i < argc) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + out: + mutex_unlock(&synth_event_mutex); + + if (event) { + if (delete_event) { + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } else { + ret = register_synth_event(event); + add_or_delete_synth_event(event, ret); + } + } + + return ret; + err: + mutex_unlock(&synth_event_mutex); + + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + free_synth_event(event); + + return ret; +} + +static int release_all_synth_events(void) +{ + struct list_head release_events; + struct synth_event *event, *e; + int ret = 0; + + INIT_LIST_HEAD(&release_events); + + mutex_lock(&synth_event_mutex); + + list_for_each_entry(event, &synth_event_list, list) { + if (event->ref) { + mutex_unlock(&synth_event_mutex); + return -EBUSY; + } + } + + list_splice_init(&event->list, &release_events); + + mutex_unlock(&synth_event_mutex); + + list_for_each_entry_safe(event, e, &release_events, list) { + list_del(&event->list); + + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } + + return ret; +} + + +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&synth_event_mutex); + + return seq_list_start(&synth_event_list, *pos); +} + +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &synth_event_list, pos); +} + +static void synth_events_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&synth_event_mutex); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct synth_field *field; + struct synth_event *event = v; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? "" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations synth_events_seq_op = { + .start = synth_events_seq_start, + .next = synth_events_seq_next, + .stop = synth_events_seq_stop, + .show = synth_events_seq_show +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_synth_events(); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static u64 hist_field_timestamp(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_trigger_data *hist_data = hist_field->hist_data; + struct trace_array *tr = hist_data->event_file->tr; + + u64 ts = ring_buffer_event_time_stamp(rbe); + + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) + ts = ns2usecs(ts); + + return ts; +} + +static u64 hist_field_cpu(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + int cpu = smp_processor_id(); + + return cpu; +} + +static struct hist_field * +check_field_for_var_ref(struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *found = NULL; + + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) { + found = hist_field; + } + } + + return found; +} + +static struct hist_field * +check_field_for_var_refs(struct hist_trigger_data *hist_data, + struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx, + unsigned int level) +{ + struct hist_field *found = NULL; + unsigned int i; + + if (level > 3) + return found; + + if (!hist_field) + return found; + + found = check_field_for_var_ref(hist_field, var_data, var_idx); + if (found) + return found; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + found = check_field_for_var_refs(hist_data, operand, var_data, + var_idx, level + 1); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *hist_field, *found = NULL; + unsigned int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + + for (i = 0; i < hist_data->n_synth_var_refs; i++) { + hist_field = hist_data->synth_var_refs[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, + unsigned int var_idx) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *found = NULL; + struct hist_var_data *var_data; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) + continue; + found = find_var_ref(var_data->hist_data, hist_data, var_idx); + if (found) + break; + } + + return found; +} + +static bool check_var_refs(struct hist_trigger_data *hist_data) +{ + struct hist_field *field; + bool found = false; + int i; + + for_each_hist_field(i, hist_data) { + field = hist_data->fields[i]; + if (field && field->flags & HIST_FIELD_FL_VAR) { + if (find_any_var_ref(hist_data, field->var.idx)) { + found = true; + break; + } + } + } + + return found; +} + +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data, *found = NULL; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) { + found = var_data; + break; + } + } + + return found; +} + +static bool field_has_hist_vars(struct hist_field *hist_field, + unsigned int level) +{ + int i; + + if (level > 3) + return false; + + if (!hist_field) + return false; + + if (hist_field->flags & HIST_FIELD_FL_VAR || + hist_field->flags & HIST_FIELD_FL_VAR_REF) + return true; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + if (field_has_hist_vars(operand, level + 1)) + return true; + } + + return false; +} + +static bool has_hist_vars(struct hist_trigger_data *hist_data) +{ + struct hist_field *hist_field; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (field_has_hist_vars(hist_field, 0)) + return true; + } + + return false; +} + +static int save_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (var_data) + return 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); + if (!var_data) { + trace_array_put(tr); + return -ENOMEM; + } + + var_data->hist_data = hist_data; + list_add(&var_data->list, &tr->hist_vars); + + return 0; +} + +static void remove_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (!var_data) + return; + + if (WARN_ON(check_var_refs(hist_data))) + return; + + list_del(&var_data->list); + + kfree(var_data); + + trace_array_put(tr); +} + +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data, + const char *var_name) +{ + struct hist_field *hist_field, *found = NULL; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR && + strcmp(hist_field->var.name, var_name) == 0) { + found = hist_field; + break; + } + } + + return found; +} + +static struct hist_field *find_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + hist_field = find_var_field(hist_data, var_name); + if (hist_field) + return hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct trace_event_file *find_var_file(struct trace_array *tr, + char *system, + char *event_name, + char *var_name) +{ + struct hist_trigger_data *var_hist_data; + struct hist_var_data *var_data; + struct trace_event_file *file, *found = NULL; + + if (system) + return find_event_file(tr, system, event_name); + + list_for_each_entry(var_data, &tr->hist_vars, list) { + var_hist_data = var_data->hist_data; + file = var_hist_data->event_file; + if (file == found) + continue; + + if (find_var_field(var_hist_data, var_name)) { + if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + return NULL; + } + + found = file; + } + } + + return found; +} + +static struct hist_field *find_file_var(struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct hist_field * +find_match_var(struct hist_trigger_data *hist_data, char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field, *found = NULL; + struct trace_event_file *file; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) { + char *system = data->onmatch.match_event_system; + char *event_name = data->onmatch.match_event; + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + continue; + hist_field = find_file_var(file, var_name); + if (hist_field) { + if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + return ERR_PTR(-EINVAL); + } + + found = hist_field; + } + } + } + return found; +} + +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, + char *system, + char *event_name, + char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field = NULL; + struct trace_event_file *file; + + if (!system || !event_name) { + hist_field = find_match_var(hist_data, var_name); + if (IS_ERR(hist_field)) + return NULL; + if (hist_field) + return hist_field; + } + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + return NULL; + + hist_field = find_file_var(file, var_name); + + return hist_field; +} + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; }; +static u64 hist_field_var_ref(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + u64 var_val = 0; + + elt_data = elt->private_data; + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; + + return var_val; +} + +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key, + u64 *var_ref_vals, bool self) +{ + struct hist_trigger_data *var_data; + struct tracing_map_elt *var_elt; + struct hist_field *hist_field; + unsigned int i, var_idx; + bool resolved = true; + u64 var_val = 0; + + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + var_idx = hist_field->var.idx; + var_data = hist_field->var.hist_data; + + if (var_data == NULL) { + resolved = false; + break; + } + + if ((self && var_data != hist_data) || + (!self && var_data == hist_data)) + continue; + + var_elt = tracing_map_lookup(var_data->map, key); + if (!var_elt) { + resolved = false; + break; + } + + if (!tracing_map_var_set(var_elt, var_idx)) { + resolved = false; + break; + } + + if (self || !hist_field->read_once) + var_val = tracing_map_read_var(var_elt, var_idx); + else + var_val = tracing_map_read_var_once(var_elt, var_idx); + + var_ref_vals[i] = var_val; + } + + return resolved; +} + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; - else if (field->flags & HIST_FIELD_FL_LOG2) + else if (field->flags & HIST_FIELD_FL_LOG2 || + field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); + else if (field->flags & HIST_FIELD_FL_CPU) + field_name = "cpu"; + else if (field->flags & HIST_FIELD_FL_EXPR || + field->flags & HIST_FIELD_FL_VAR_REF) { + if (field->system) { + static char full_name[MAX_FILTER_STR_VAL]; + + strcat(full_name, field->system); + strcat(full_name, "."); + strcat(full_name, field->event_name); + strcat(full_name, "."); + strcat(full_name, field->name); + field_name = full_name; + } else + field_name = field->name; + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) + field_name = "common_timestamp"; if (field_name == NULL) field_name = ""; @@ -232,16 +1771,119 @@ static int parse_map_size(char *str) static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) { + unsigned int i; + if (!attrs) return; + for (i = 0; i < attrs->n_assignments; i++) + kfree(attrs->assignment_str[i]); + + for (i = 0; i < attrs->n_actions; i++) + kfree(attrs->action_str[i]); + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); + kfree(attrs->clock); kfree(attrs); } +static int parse_action(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = -EINVAL; + + if (attrs->n_actions >= HIST_ACTIONS_MAX) + return ret; + + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) || + (strncmp(str, "onmax(", strlen("onmax(")) == 0)) { + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); + if (!attrs->action_str[attrs->n_actions]) { + ret = -ENOMEM; + return ret; + } + attrs->n_actions++; + ret = 0; + } + + return ret; +} + +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = 0; + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) { + attrs->keys_str = kstrdup(str, GFP_KERNEL); + if (!attrs->keys_str) { + ret = -ENOMEM; + goto out; + } + } else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) { + attrs->vals_str = kstrdup(str, GFP_KERNEL); + if (!attrs->vals_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "sort=", strlen("sort=")) == 0) { + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + if (!attrs->sort_key_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "name=", strlen("name=")) == 0) { + attrs->name = kstrdup(str, GFP_KERNEL); + if (!attrs->name) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "clock=", strlen("clock=")) == 0) { + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + str = strstrip(str); + attrs->clock = kstrdup(str, GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto out; + } + attrs->map_bits = map_bits; + } else { + char *assignment; + + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", str); + ret = -EINVAL; + goto out; + } + + assignment = kstrdup(str, GFP_KERNEL); + if (!assignment) { + ret = -ENOMEM; + goto out; + } + + attrs->assignment_str[attrs->n_assignments++] = assignment; + } + out: + return ret; +} + static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) { struct hist_trigger_attrs *attrs; @@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) while (trigger_str) { char *str = strsep(&trigger_str, ":"); - if ((strncmp(str, "key=", strlen("key=")) == 0) || - (strncmp(str, "keys=", strlen("keys=")) == 0)) - attrs->keys_str = kstrdup(str, GFP_KERNEL); - else if ((strncmp(str, "val=", strlen("val=")) == 0) || - (strncmp(str, "vals=", strlen("vals=")) == 0) || - (strncmp(str, "values=", strlen("values=")) == 0)) - attrs->vals_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "sort=", strlen("sort=")) == 0) - attrs->sort_key_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "name=", strlen("name=")) == 0) - attrs->name = kstrdup(str, GFP_KERNEL); - else if (strcmp(str, "pause") == 0) + if (strchr(str, '=')) { + ret = parse_assignment(str, attrs); + if (ret) + goto free; + } else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) attrs->cont = true; else if (strcmp(str, "clear") == 0) attrs->clear = true; - else if (strncmp(str, "size=", strlen("size=")) == 0) { - int map_bits = parse_map_size(str); - - if (map_bits < 0) { - ret = map_bits; + else { + ret = parse_action(str, attrs); + if (ret) goto free; - } - attrs->map_bits = map_bits; - } else { - ret = -EINVAL; - goto free; } } @@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) goto free; } + if (!attrs->clock) { + attrs->clock = kstrdup("global", GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto free; + } + } + return attrs; free: destroy_hist_trigger_attrs(attrs); @@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task) memcpy(comm, task->comm, TASK_COMM_LEN); } -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +static void hist_elt_data_free(struct hist_elt_data *elt_data) { - kfree((char *)elt->private_data); + unsigned int i; + + for (i = 0; i < SYNTH_FIELDS_MAX; i++) + kfree(elt_data->field_var_str[i]); + + kfree(elt_data->comm); + kfree(elt_data); } -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + hist_elt_data_free(elt_data); +} + +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) { struct hist_trigger_data *hist_data = elt->map->private_data; + unsigned int size = TASK_COMM_LEN; + struct hist_elt_data *elt_data; struct hist_field *key_field; - unsigned int i; + unsigned int i, n_str; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) + return -ENOMEM; for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - unsigned int size = TASK_COMM_LEN + 1; - - elt->private_data = kzalloc(size, GFP_KERNEL); - if (!elt->private_data) + elt_data->comm = kzalloc(size, GFP_KERNEL); + if (!elt_data->comm) { + kfree(elt_data); return -ENOMEM; + } break; } } + n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + + size = STR_VAR_LEN_MAX; + + for (i = 0; i < n_str; i++) { + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); + if (!elt_data->field_var_str[i]) { + hist_elt_data_free(elt_data); + return -ENOMEM; + } + } + + elt->private_data = elt_data; + return 0; } -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, - struct tracing_map_elt *from) +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + if (elt_data->comm) + save_comm(elt_data->comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_data_ops = { + .elt_alloc = hist_trigger_elt_data_alloc, + .elt_free = hist_trigger_elt_data_free, + .elt_init = hist_trigger_elt_data_init, +}; + +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; + + return flags_str; +} + +static void expr_field_str(struct hist_field *field, char *expr) { - char *comm_from = from->private_data; - char *comm_to = to->private_data; + if (field->flags & HIST_FIELD_FL_VAR_REF) + strcat(expr, "$"); + + strcat(expr, hist_field_name(field, 0)); - if (comm_from) - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) { + const char *flags_str = get_hist_field_flags(field); + + if (flags_str) { + strcat(expr, "."); + strcat(expr, flags_str); + } + } } -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +static char *expr_str(struct hist_field *field, unsigned int level) { - char *comm = elt->private_data; + char *expr; + + if (level > 1) + return NULL; + + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!expr) + return NULL; + + if (!field->operands[0]) { + expr_field_str(field, expr); + return expr; + } + + if (field->operator == FIELD_OP_UNARY_MINUS) { + char *subexpr; - if (comm) - save_comm(comm, current); + strcat(expr, "-("); + subexpr = expr_str(field->operands[0], ++level); + if (!subexpr) { + kfree(expr); + return NULL; + } + strcat(expr, subexpr); + strcat(expr, ")"); + + kfree(subexpr); + + return expr; + } + + expr_field_str(field->operands[0], expr); + + switch (field->operator) { + case FIELD_OP_MINUS: + strcat(expr, "-"); + break; + case FIELD_OP_PLUS: + strcat(expr, "+"); + break; + default: + kfree(expr); + return NULL; + } + + expr_field_str(field->operands[1], expr); + + return expr; } -static const struct tracing_map_ops hist_trigger_elt_comm_ops = { - .elt_alloc = hist_trigger_elt_comm_alloc, - .elt_copy = hist_trigger_elt_comm_copy, - .elt_free = hist_trigger_elt_comm_free, - .elt_init = hist_trigger_elt_comm_init, -}; +static int contains_operator(char *str) +{ + enum field_op_id field_op = FIELD_OP_NONE; + char *op; + + op = strpbrk(str, "+-"); + if (!op) + return FIELD_OP_NONE; + + switch (*op) { + case '-': + if (*str == '-') + field_op = FIELD_OP_UNARY_MINUS; + else + field_op = FIELD_OP_MINUS; + break; + case '+': + field_op = FIELD_OP_PLUS; + break; + default: + break; + } + + return field_op; +} static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { unsigned int i; - if (level > 2) + if (level > 3) return; if (!hist_field) @@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field, for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) destroy_hist_field(hist_field->operands[i], level + 1); + kfree(hist_field->var.name); + kfree(hist_field->name); + kfree(hist_field->type); + kfree(hist_field); } -static struct hist_field *create_hist_field(struct ftrace_event_field *field, - unsigned long flags) +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, + struct ftrace_event_field *field, + unsigned long flags, + char *var_name) { struct hist_field *hist_field; @@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (!hist_field) return NULL; + hist_field->hist_data = hist_data; + + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) + goto out; /* caller will populate */ + + if (flags & HIST_FIELD_FL_VAR_REF) { + hist_field->fn = hist_field_var_ref; + goto out; + } + if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (flags & HIST_FIELD_FL_LOG2) { unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; - hist_field->operands[0] = create_hist_field(field, fl); + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; + hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_TIMESTAMP) { + hist_field->fn = hist_field_timestamp; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_CPU) { + hist_field->fn = hist_field_cpu; + hist_field->size = sizeof(int); + hist_field->type = kstrdup("unsigned int", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + if (field->filter_type == FILTER_STATIC_STRING) hist_field->fn = hist_field_string; else if (field->filter_type == FILTER_DYN_STRING) @@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, else hist_field->fn = hist_field_pstring; } else { + hist_field->size = field->size; + hist_field->is_signed = field->is_signed; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + hist_field->fn = select_value_fn(field->size, field->is_signed); if (!hist_field->fn) { @@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, hist_field->field = field; hist_field->flags = flags; + if (var_name) { + hist_field->var.name = kstrdup(var_name, GFP_KERNEL); + if (!hist_field->var.name) + goto free; + } + return hist_field; + free: + destroy_hist_field(hist_field, 0); + return NULL; } static void destroy_hist_fields(struct hist_trigger_data *hist_data) { unsigned int i; - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + for (i = 0; i < HIST_FIELDS_MAX; i++) { if (hist_data->fields[i]) { destroy_hist_field(hist_data->fields[i], 0); hist_data->fields[i] = NULL; @@ -451,69 +2287,1610 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) } } -static int create_hitcount_val(struct hist_trigger_data *hist_data) +static int init_var_ref(struct hist_field *ref_field, + struct hist_field *var_field, + char *system, char *event_name) { - hist_data->fields[HITCOUNT_IDX] = - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); - if (!hist_data->fields[HITCOUNT_IDX]) - return -ENOMEM; + int err = 0; + + ref_field->var.idx = var_field->var.idx; + ref_field->var.hist_data = var_field->hist_data; + ref_field->size = var_field->size; + ref_field->is_signed = var_field->is_signed; + ref_field->flags |= var_field->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + if (system) { + ref_field->system = kstrdup(system, GFP_KERNEL); + if (!ref_field->system) + return -ENOMEM; + } - hist_data->n_vals++; + if (event_name) { + ref_field->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ref_field->event_name) { + err = -ENOMEM; + goto free; + } + } - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + if (var_field->var.name) { + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } else if (var_field->name) { + ref_field->name = kstrdup(var_field->name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } + + ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + if (!ref_field->type) { + err = -ENOMEM; + goto free; + } + out: + return err; + free: + kfree(ref_field->system); + kfree(ref_field->event_name); + kfree(ref_field->name); + + goto out; +} + +static struct hist_field *create_var_ref(struct hist_field *var_field, + char *system, char *event_name) +{ + unsigned long flags = HIST_FIELD_FL_VAR_REF; + struct hist_field *ref_field; + + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); + if (ref_field) { + if (init_var_ref(ref_field, var_field, system, event_name)) { + destroy_hist_field(ref_field, 0); + return NULL; + } + } + + return ref_field; +} + +static bool is_var_ref(char *var_name) +{ + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$') + return false; + + return true; +} + +static char *field_name_from_var(struct hist_trigger_data *hist_data, + char *var_name) +{ + char *name, *field; + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + name = hist_data->attrs->var_defs.name[i]; + + if (strcmp(var_name, name) == 0) { + field = hist_data->attrs->var_defs.expr[i]; + if (contains_operator(field) || is_var_ref(field)) + continue; + return field; + } + } + + return NULL; +} + +static char *local_field_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct trace_event_call *call; + + if (system && event_name) { + call = hist_data->event_file->event_call; + + if (strcmp(system, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + if (!!system != !!event_name) + return NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + return field_name_from_var(hist_data, var_name); +} + +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct hist_field *var_field = NULL, *ref_field = NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + var_field = find_event_var(hist_data, system, event_name, var_name); + if (var_field) + ref_field = create_var_ref(var_field, system, event_name); + + if (!ref_field) + hist_err_event("Couldn't find variable: $", + system, event_name, var_name); + + return ref_field; +} + +static struct ftrace_event_field * +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + char *field_str, unsigned long *flags) +{ + struct ftrace_event_field *field = NULL; + char *field_name, *modifier, *str; + + modifier = str = kstrdup(field_str, GFP_KERNEL); + if (!modifier) + return ERR_PTR(-ENOMEM); + + field_name = strsep(&modifier, "."); + if (modifier) { + if (strcmp(modifier, "hex") == 0) + *flags |= HIST_FIELD_FL_HEX; + else if (strcmp(modifier, "sym") == 0) + *flags |= HIST_FIELD_FL_SYM; + else if (strcmp(modifier, "sym-offset") == 0) + *flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(modifier, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + *flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(modifier, "syscall") == 0) + *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "log2") == 0) + *flags |= HIST_FIELD_FL_LOG2; + else if (strcmp(modifier, "usecs") == 0) + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; + else { + field = ERR_PTR(-EINVAL); + goto out; + } + } + + if (strcmp(field_name, "common_timestamp") == 0) { + *flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "cpu") == 0) + *flags |= HIST_FIELD_FL_CPU; + else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + field = ERR_PTR(-EINVAL); + goto out; + } + } + out: + kfree(str); + + return field; +} + +static struct hist_field *create_alias(struct hist_trigger_data *hist_data, + struct hist_field *var_ref, + char *var_name) +{ + struct hist_field *alias = NULL; + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR; + + alias = create_hist_field(hist_data, NULL, flags, var_name); + if (!alias) + return NULL; + + alias->fn = var_ref->fn; + alias->operands[0] = var_ref; + + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { + destroy_hist_field(alias, 0); + return NULL; + } + + return alias; +} + +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, + struct trace_event_file *file, char *str, + unsigned long *flags, char *var_name) +{ + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; + struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + int ret = 0; + + s = strchr(str, '.'); + if (s) { + s = strchr(++s, '.'); + if (s) { + ref_system = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_event = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_var = str; + } + } + + s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); + if (!s) { + hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + if (hist_field) { + hist_data->var_refs[hist_data->n_var_refs] = hist_field; + hist_field->var_ref_idx = hist_data->n_var_refs++; + if (var_name) { + hist_field = create_alias(hist_data, hist_field, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + } + return hist_field; + } + } else + str = s; + + field = parse_field(hist_data, file, str, flags); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto out; + } + + hist_field = create_hist_field(hist_data, field, *flags, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + + return hist_field; + out: + return ERR_PTR(ret); +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level); + +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) +{ + struct hist_field *operand1, *expr = NULL; + unsigned long operand_flags; + int ret = 0; + char *s; + + /* we support only -(xxx) i.e. explicit parens required */ + + if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); + ret = -EINVAL; + goto free; + } + + str++; /* skip leading '-' */ + + s = strchr(str, '('); + if (s) + str++; + else { + ret = -EINVAL; + goto free; + } + + s = strrchr(str, ')'); + if (s) + *s = '\0'; + else { + ret = -EINVAL; /* no closing ')' */ + goto free; + } + + flags |= HIST_FIELD_FL_EXPR; + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand_flags = 0; + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + goto free; + } + + expr->flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + expr->fn = hist_field_unary_minus; + expr->operands[0] = operand1; + expr->operator = FIELD_OP_UNARY_MINUS; + expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + return expr; + free: + destroy_hist_field(expr, 0); + return ERR_PTR(ret); +} + +static int check_expr_operands(struct hist_field *operand1, + struct hist_field *operand2) +{ + unsigned long operand1_flags = operand1->flags; + unsigned long operand2_flags = operand2->flags; + + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) || + (operand1_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand1->var.hist_data, operand1->name); + if (!var) + return -EINVAL; + operand1_flags = var->flags; + } + + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || + (operand2_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand2->var.hist_data, operand2->name); + if (!var) + return -EINVAL; + operand2_flags = var->flags; + } + + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { + hist_err("Timestamp units in expression don't match", NULL); return -EINVAL; + } return 0; } -static int create_val_field(struct hist_trigger_data *hist_data, - unsigned int val_idx, - struct trace_event_file *file, - char *field_str) +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) { - struct ftrace_event_field *field = NULL; - unsigned long flags = 0; - char *field_name; + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; + unsigned long operand_flags; + int field_op, ret = -EINVAL; + char *sep, *operand1_str; + + if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); + return ERR_PTR(-EINVAL); + } + + field_op = contains_operator(str); + + if (field_op == FIELD_OP_NONE) + return parse_atom(hist_data, file, str, &flags, var_name); + + if (field_op == FIELD_OP_UNARY_MINUS) + return parse_unary(hist_data, file, str, flags, var_name, ++level); + + switch (field_op) { + case FIELD_OP_MINUS: + sep = "-"; + break; + case FIELD_OP_PLUS: + sep = "+"; + break; + default: + goto free; + } + + operand1_str = strsep(&str, sep); + if (!operand1_str || !str) + goto free; + + operand_flags = 0; + operand1 = parse_atom(hist_data, file, operand1_str, + &operand_flags, NULL); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + operand1 = NULL; + goto free; + } + + /* rest of string could be another expression e.g. b+c in a+b+c */ + operand_flags = 0; + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand2)) { + ret = PTR_ERR(operand2); + operand2 = NULL; + goto free; + } + + ret = check_expr_operands(operand1, operand2); + if (ret) + goto free; + + flags |= HIST_FIELD_FL_EXPR; + + flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand1->read_once = true; + operand2->read_once = true; + + expr->operands[0] = operand1; + expr->operands[1] = operand2; + expr->operator = field_op; + expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + switch (field_op) { + case FIELD_OP_MINUS: + expr->fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + expr->fn = hist_field_plus; + break; + default: + ret = -EINVAL; + goto free; + } + + return expr; + free: + destroy_hist_field(operand1, 0); + destroy_hist_field(operand2, 0); + destroy_hist_field(expr, 0); + + return ERR_PTR(ret); +} + +static char *find_trigger_filter(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (test->private_data == hist_data) + return test->filter_str; + } + } + + return NULL; +} + +static struct event_command trigger_hist_cmd; +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); + +static bool compatible_keys(struct hist_trigger_data *target_hist_data, + struct hist_trigger_data *hist_data, + unsigned int n_keys) +{ + struct hist_field *target_hist_field, *hist_field; + unsigned int n, i, j; + + if (hist_data->n_fields - hist_data->n_vals != n_keys) + return false; + + i = hist_data->n_vals; + j = target_hist_data->n_vals; + + for (n = 0; n < n_keys; n++) { + hist_field = hist_data->fields[i + n]; + target_hist_field = target_hist_data->fields[j + n]; + + if (strcmp(hist_field->type, target_hist_field->type) != 0) + return false; + if (hist_field->size != target_hist_field->size) + return false; + if (hist_field->is_signed != target_hist_field->is_signed) + return false; + } + + return true; +} + +static struct hist_trigger_data * +find_compatible_hist(struct hist_trigger_data *target_hist_data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + unsigned int n_keys; + + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + + if (compatible_keys(target_hist_data, hist_data, n_keys)) + return hist_data; + } + } + + return NULL; +} + +static struct trace_event_file *event_file(struct trace_array *tr, + char *system, char *event_name) +{ + struct trace_event_file *file; + + file = find_event_file(tr, system, event_name); + if (!file) + return ERR_PTR(-EINVAL); + + return file; +} + +static struct hist_field * +find_synthetic_field_var(struct hist_trigger_data *target_hist_data, + char *system, char *event_name, char *field_name) +{ + struct hist_field *event_var; + char *synthetic_name; + + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!synthetic_name) + return ERR_PTR(-ENOMEM); + + strcpy(synthetic_name, "synthetic_"); + strcat(synthetic_name, field_name); + + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); + + kfree(synthetic_name); + + return event_var; +} + +/** + * create_field_var_hist - Automatically create a histogram and var for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @field_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + + * If a user specifies a field on an event that isn't the event the + * histogram currently being defined (the target event histogram), the + * only way that can be accomplished is if a new hist trigger is + * created and the field variable defined on that. + * + * This function creates a new histogram compatible with the target + * event (meaning a histogram with the same key as the target + * histogram), and creates a variable for the specified field, but + * with 'synthetic_' prepended to the variable name in order to avoid + * collision with normal field variables. + * + * Return: The variable created for the field. + */ +static struct hist_field * +create_field_var_hist(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *field_name) +{ + struct trace_array *tr = target_hist_data->event_file->tr; + struct hist_field *event_var = ERR_PTR(-EINVAL); + struct hist_trigger_data *hist_data; + unsigned int i, n, first = true; + struct field_var_hist *var_hist; + struct trace_event_file *file; + struct hist_field *key_field; + char *saved_filter; + char *cmd; + int ret; + + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { + hist_err_event("onmatch: Too many field variables defined: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + file = event_file(tr, subsys_name, event_name); + + if (IS_ERR(file)) { + hist_err_event("onmatch: Event file not found: ", + subsys_name, event_name, field_name); + ret = PTR_ERR(file); + return ERR_PTR(ret); + } + + /* + * Look for a histogram compatible with target. We'll use the + * found histogram specification to create a new matching + * histogram with our variable on it. target_hist_data is not + * yet a registered histogram so we can't use that. + */ + hist_data = find_compatible_hist(target_hist_data, file); + if (!hist_data) { + hist_err_event("onmatch: Matching event histogram not found: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + /* See if a synthetic field variable has already been created */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (!IS_ERR_OR_NULL(event_var)) + return event_var; + + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); + if (!var_hist) + return ERR_PTR(-ENOMEM); + + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!cmd) { + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Use the same keys as the compatible histogram */ + strcat(cmd, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + if (!first) + strcat(cmd, ","); + strcat(cmd, key_field->field->name); + first = false; + } + + /* Create the synthetic field variable specification */ + strcat(cmd, ":synthetic_"); + strcat(cmd, field_name); + strcat(cmd, "="); + strcat(cmd, field_name); + + /* Use the same filter as the compatible histogram */ + saved_filter = find_trigger_filter(hist_data, file); + if (saved_filter) { + strcat(cmd, " if "); + strcat(cmd, saved_filter); + } + + var_hist->cmd = kstrdup(cmd, GFP_KERNEL); + if (!var_hist->cmd) { + kfree(cmd); + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Save the compatible histogram information */ + var_hist->hist_data = hist_data; + + /* Create the new histogram with our variable */ + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "", "hist", cmd); + if (ret) { + kfree(cmd); + kfree(var_hist->cmd); + kfree(var_hist); + hist_err_event("onmatch: Couldn't create histogram for field: ", + subsys_name, event_name, field_name); + return ERR_PTR(ret); + } + + kfree(cmd); + + /* If we can't find the variable, something went wrong */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (IS_ERR_OR_NULL(event_var)) { + kfree(var_hist->cmd); + kfree(var_hist); + hist_err_event("onmatch: Couldn't find synthetic variable: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + n = target_hist_data->n_field_var_hists; + target_hist_data->field_var_hists[n] = var_hist; + target_hist_data->n_field_var_hists++; + + return event_var; +} + +static struct hist_field * +find_target_event_var(struct hist_trigger_data *hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *hist_field = NULL; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + hist_field = find_var_field(hist_data, var_name); + + return hist_field; +} + +static inline void __update_field_vars(struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec, + struct field_var **field_vars, + unsigned int n_field_vars, + unsigned int field_var_str_start) +{ + struct hist_elt_data *elt_data = elt->private_data; + unsigned int i, j, var_idx; + u64 var_val; + + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { + struct field_var *field_var = field_vars[i]; + struct hist_field *var = field_var->var; + struct hist_field *val = field_var->val; + + var_val = val->fn(val, elt, rbe, rec); + var_idx = var->var.idx; + + if (val->flags & HIST_FIELD_FL_STRING) { + char *str = elt_data->field_var_str[j++]; + char *val_str = (char *)(uintptr_t)var_val; + + strscpy(str, val_str, STR_VAR_LEN_MAX); + var_val = (u64)(uintptr_t)str; + } + tracing_map_set_var(elt, var_idx, var_val); + } +} + +static void update_field_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->field_vars, + hist_data->n_field_vars, 0); +} + +static void update_max_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->max_vars, + hist_data->n_max_vars, hist_data->n_field_var_str); +} + +static struct hist_field *create_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *name, int size, const char *type) +{ + struct hist_field *var; + int idx; + + if (find_var(hist_data, file, name) && !hist_data->remove) { + var = ERR_PTR(-EINVAL); + goto out; + } + + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!var) { + var = ERR_PTR(-ENOMEM); + goto out; + } + + idx = tracing_map_add_var(hist_data->map); + if (idx < 0) { + kfree(var); + var = ERR_PTR(-EINVAL); + goto out; + } + + var->flags = HIST_FIELD_FL_VAR; + var->var.idx = idx; + var->var.hist_data = var->hist_data = hist_data; + var->size = size; + var->var.name = kstrdup(name, GFP_KERNEL); + var->type = kstrdup(type, GFP_KERNEL); + if (!var->var.name || !var->type) { + kfree(var->var.name); + kfree(var->type); + kfree(var); + var = ERR_PTR(-ENOMEM); + } + out: + return var; +} + +static struct field_var *create_field_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *field_name) +{ + struct hist_field *val = NULL, *var = NULL; + unsigned long flags = HIST_FIELD_FL_VAR; + struct field_var *field_var; int ret = 0; - if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { + hist_err("Too many field variables defined: ", field_name); + ret = -EINVAL; + goto err; + } + + val = parse_atom(hist_data, file, field_name, &flags, NULL); + if (IS_ERR(val)) { + hist_err("Couldn't parse field variable: ", field_name); + ret = PTR_ERR(val); + goto err; + } + + var = create_var(hist_data, file, field_name, val->size, val->type); + if (IS_ERR(var)) { + hist_err("Couldn't create or find variable: ", field_name); + kfree(val); + ret = PTR_ERR(var); + goto err; + } + + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); + if (!field_var) { + kfree(val); + kfree(var); + ret = -ENOMEM; + goto err; + } + + field_var->var = var; + field_var->val = val; + out: + return field_var; + err: + field_var = ERR_PTR(ret); + goto out; +} + +/** + * create_target_field_var - Automatically create a variable for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @var_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + + * This function creates a field variable with the name var_name on + * the hist trigger currently being defined on the target event. If + * subsys_name and event_name are specified, this function simply + * verifies that they do in fact match the target event subsystem and + * event name. + * + * Return: The variable created for the field. + */ +static struct field_var * +create_target_field_var(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = target_hist_data->event_file; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + return create_field_var(target_hist_data, file, var_name); +} + +static void onmax_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + + for (i = 0; i < hist_data->n_max_vars; i++) { + struct hist_field *save_val = hist_data->max_vars[i]->val; + struct hist_field *save_var = hist_data->max_vars[i]->var; + u64 val; + + save_var_idx = save_var->var.idx; + + val = tracing_map_read_var(elt, save_var_idx); + + if (save_val->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, " %s: %-32s", save_var->var.name, + (char *)(uintptr_t)(val)); + } else + seq_printf(m, " %s: %10llu", save_var->var.name, val); + } +} + +static void onmax_save(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals) +{ + unsigned int max_idx = data->onmax.max_var->var.idx; + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; + + u64 var_val, max_val; + + var_val = var_ref_vals[max_var_ref_idx]; + max_val = tracing_map_read_var(elt, max_idx); + + if (var_val <= max_val) + return; + + tracing_map_set_var(elt, max_idx, var_val); + + update_max_vars(hist_data, elt, rbe, rec); +} + +static void onmax_destroy(struct action_data *data) +{ + unsigned int i; + + destroy_hist_field(data->onmax.max_var, 0); + destroy_hist_field(data->onmax.var, 0); + + kfree(data->onmax.var_str); + kfree(data->onmax.fn_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + kfree(data); +} + +static int onmax_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *var_field, *ref_field, *max_var; + unsigned int var_ref_idx = hist_data->n_var_refs; + struct field_var *field_var; + char *onmax_var_str, *param; + unsigned long flags; + unsigned int i; + int ret = 0; + + onmax_var_str = data->onmax.var_str; + if (onmax_var_str[0] != '$') { + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); return -EINVAL; + } + onmax_var_str++; - field_name = strsep(&field_str, "."); - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else { + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + if (!var_field) { + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + return -EINVAL; + } + + flags = HIST_FIELD_FL_VAR_REF; + ref_field = create_hist_field(hist_data, NULL, flags, NULL); + if (!ref_field) + return -ENOMEM; + + if (init_var_ref(ref_field, var_field, NULL, NULL)) { + destroy_hist_field(ref_field, 0); + ret = -ENOMEM; + goto out; + } + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; + data->onmax.var = ref_field; + + data->fn = onmax_save; + data->onmax.max_var_ref_idx = var_ref_idx; + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); + if (IS_ERR(max_var)) { + hist_err("onmax: Couldn't create onmax variable: ", "max"); + ret = PTR_ERR(max_var); + goto out; + } + data->onmax.max_var = max_var; + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err("onmax: Couldn't create field variable: ", param); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->max_vars[hist_data->n_max_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_max_var_str++; + + kfree(param); + } + out: + return ret; +} + +static int parse_action_params(char *params, struct action_data *data) +{ + char *param, *saved_param; + int ret = 0; + + while (params) { + if (data->n_params >= SYNTH_FIELDS_MAX) + goto out; + + param = strsep(¶ms, ","); + if (!param) { ret = -EINVAL; goto out; } + + param = strstrip(param); + if (strlen(param) < 2) { + hist_err("Invalid action param: ", param); + ret = -EINVAL; + goto out; + } + + saved_param = kstrdup(param, GFP_KERNEL); + if (!saved_param) { + ret = -ENOMEM; + goto out; + } + + data->params[data->n_params++] = saved_param; } + out: + return ret; +} - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { +static struct action_data *onmax_parse(char *str) +{ + char *onmax_fn_name, *onmax_var_str; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + onmax_var_str = strsep(&str, ")"); + if (!onmax_var_str || !str) { ret = -EINVAL; - goto out; + goto free; + } + + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); + if (!data->onmax.var_str) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) + goto free; + + onmax_fn_name = strsep(&str, "("); + if (!onmax_fn_name || !str) + goto free; + + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { + char *params = strsep(&str, ")"); + + if (!params) { + ret = -EINVAL; + goto free; + } + + ret = parse_action_params(params, data); + if (ret) + goto free; + } else + goto free; + + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); + if (!data->onmax.fn_name) { + ret = -ENOMEM; + goto free; + } + out: + return data; + free: + onmax_destroy(data); + data = ERR_PTR(ret); + goto out; +} + +static void onmatch_destroy(struct action_data *data) +{ + unsigned int i; + + mutex_lock(&synth_event_mutex); + + kfree(data->onmatch.match_event); + kfree(data->onmatch.match_event_system); + kfree(data->onmatch.synth_event_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + if (data->onmatch.synth_event) + data->onmatch.synth_event->ref--; + + kfree(data); + + mutex_unlock(&synth_event_mutex); +} + +static void destroy_field_var(struct field_var *field_var) +{ + if (!field_var) + return; + + destroy_hist_field(field_var->var, 0); + destroy_hist_field(field_var->val, 0); + + kfree(field_var); +} + +static void destroy_field_vars(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_vars; i++) + destroy_field_var(hist_data->field_vars[i]); +} + +static void save_field_var(struct hist_trigger_data *hist_data, + struct field_var *field_var) +{ + hist_data->field_vars[hist_data->n_field_vars++] = field_var; + + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_field_var_str++; +} + + +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_synth_var_refs; i++) + destroy_hist_field(hist_data->synth_var_refs[i], 0); +} + +static void save_synth_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_ref) +{ + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; + + hist_data->var_refs[hist_data->n_var_refs] = var_ref; + var_ref->var_ref_idx = hist_data->n_var_refs++; +} + +static int check_synth_field(struct synth_event *event, + struct hist_field *hist_field, + unsigned int field_pos) +{ + struct synth_field *field; + + if (field_pos >= event->n_fields) + return -EINVAL; + + field = event->fields[field_pos]; + + if (strcmp(field->type, hist_field->type) != 0) + return -EINVAL; + + return 0; +} + +static struct hist_field * +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, + char *system, char *event, char *var) +{ + struct hist_field *hist_field; + + var++; /* skip '$' */ + + hist_field = find_target_event_var(hist_data, system, event, var); + if (!hist_field) { + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + hist_field = find_event_var(hist_data, system, event, var); + } + + if (!hist_field) + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + + return hist_field; +} + +static struct hist_field * +onmatch_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) +{ + struct hist_field *hist_field = NULL; + struct field_var *field_var; + + /* + * First try to create a field var on the target event (the + * currently being defined). This will create a variable for + * unqualified fields on the target event, or if qualified, + * target fields that have qualified names matching the target. + */ + field_var = create_target_field_var(hist_data, system, event, var); + + if (field_var && !IS_ERR(field_var)) { + save_field_var(hist_data, field_var); + hist_field = field_var->var; + } else { + field_var = NULL; + /* + * If no explicit system.event is specfied, default to + * looking for fields on the onmatch(system.event.xxx) + * event. + */ + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + /* + * At this point, we're looking at a field on another + * event. Because we can't modify a hist trigger on + * another event to add a variable for a field, we need + * to create a new trigger on that event and create the + * variable at the same time. + */ + hist_field = create_field_var_hist(hist_data, system, event, var); + if (IS_ERR(hist_field)) + goto free; + } + out: + return hist_field; + free: + destroy_field_var(field_var); + hist_field = NULL; + goto out; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + struct action_data *data) +{ + char *event_name, *param, *system = NULL; + struct hist_field *hist_field, *var_ref; + unsigned int i, var_ref_idx; + unsigned int field_pos = 0; + struct synth_event *event; + int ret = 0; + + mutex_lock(&synth_event_mutex); + event = find_synth_event(data->onmatch.synth_event_name); + if (!event) { + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + mutex_unlock(&synth_event_mutex); + return -EINVAL; + } + event->ref++; + mutex_unlock(&synth_event_mutex); + + var_ref_idx = hist_data->n_var_refs; + + for (i = 0; i < data->n_params; i++) { + char *p; + + p = param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto err; + } + + system = strsep(¶m, "."); + if (!param) { + param = (char *)system; + system = event_name = NULL; + } else { + event_name = strsep(¶m, "."); + if (!param) { + kfree(p); + ret = -EINVAL; + goto err; + } + } + + if (param[0] == '$') + hist_field = onmatch_find_var(hist_data, data, system, + event_name, param); + else + hist_field = onmatch_create_field_var(hist_data, data, + system, + event_name, + param); + + if (!hist_field) { + kfree(p); + ret = -EINVAL; + goto err; + } + + if (check_synth_field(event, hist_field, field_pos) == 0) { + var_ref = create_var_ref(hist_field, system, event_name); + if (!var_ref) { + kfree(p); + ret = -ENOMEM; + goto err; + } + + save_synth_var_ref(hist_data, var_ref); + field_pos++; + kfree(p); + continue; + } + + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + system, event_name, param); + kfree(p); + ret = -EINVAL; + goto err; + } + + if (field_pos != event->n_fields) { + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + ret = -EINVAL; + goto err; + } + + data->fn = action_trace; + data->onmatch.synth_event = event; + data->onmatch.var_ref_idx = var_ref_idx; + out: + return ret; + err: + mutex_lock(&synth_event_mutex); + event->ref--; + mutex_unlock(&synth_event_mutex); + + goto out; +} + +static struct action_data *onmatch_parse(struct trace_array *tr, char *str) +{ + char *match_event, *match_event_system; + char *synth_event_name, *params; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + match_event = strsep(&str, ")"); + if (!match_event || !str) { + hist_err("onmatch: Missing closing paren: ", match_event); + goto free; + } + + match_event_system = strsep(&match_event, "."); + if (!match_event) { + hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + goto free; + } + + if (IS_ERR(event_file(tr, match_event_system, match_event))) { + hist_err_event("onmatch: Invalid subsystem or event name: ", + match_event_system, match_event, NULL); + goto free; + } + + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); + if (!data->onmatch.match_event) { + ret = -ENOMEM; + goto free; + } + + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->onmatch.match_event_system) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) { + hist_err("onmatch: Missing . after onmatch(): ", str); + goto free; + } + + synth_event_name = strsep(&str, "("); + if (!synth_event_name || !str) { + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); + goto free; } - hist_data->fields[val_idx] = create_hist_field(field, flags); - if (!hist_data->fields[val_idx]) { + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); + if (!data->onmatch.synth_event_name) { ret = -ENOMEM; + goto free; + } + + params = strsep(&str, ")"); + if (!params || !str || (str && strlen(str))) { + hist_err("onmatch: Missing closing paramlist paren: ", params); + goto free; + } + + ret = parse_action_params(params, data); + if (ret) + goto free; + out: + return data; + free: + onmatch_destroy(data); + data = ERR_PTR(ret); + goto out; +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + hist_data->n_fields++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int __create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *field_str, + unsigned long flags) +{ + struct hist_field *hist_field; + int ret = 0; + + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); goto out; } + hist_data->fields[val_idx] = hist_field; + ++hist_data->n_vals; + ++hist_data->n_fields; - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) ret = -EINVAL; out: return ret; } +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); +} + +static int create_var_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *expr_str) +{ + unsigned long flags = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) + return -EINVAL; + + if (find_var(hist_data, file, var_name) && !hist_data->remove) { + hist_err("Variable already defined: ", var_name); + return -EINVAL; + } + + flags |= HIST_FIELD_FL_VAR; + hist_data->n_vars++; + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); +} + static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { char *fields_str, *field_str; - unsigned int i, j; + unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -533,12 +3910,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data, field_str = strsep(&fields_str, ","); if (!field_str) break; + if (strcmp(field_str, "hitcount") == 0) continue; + ret = create_val_field(hist_data, j++, file, field_str); if (ret) goto out; } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: @@ -551,12 +3931,13 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { - struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + unsigned long flags = 0; unsigned int key_size; int ret = 0; - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; flags |= HIST_FIELD_FL_KEY; @@ -564,57 +3945,40 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (strcmp(field_str, "stacktrace") == 0) { flags |= HIST_FIELD_FL_STACKTRACE; key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { - char *field_name = strsep(&field_str, "."); - - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else if (strcmp(field_str, "sym") == 0) - flags |= HIST_FIELD_FL_SYM; - else if (strcmp(field_str, "sym-offset") == 0) - flags |= HIST_FIELD_FL_SYM_OFFSET; - else if ((strcmp(field_str, "execname") == 0) && - (strcmp(field_name, "common_pid") == 0)) - flags |= HIST_FIELD_FL_EXECNAME; - else if (strcmp(field_str, "syscall") == 0) - flags |= HIST_FIELD_FL_SYSCALL; - else if (strcmp(field_str, "log2") == 0) - flags |= HIST_FIELD_FL_LOG2; - else { - ret = -EINVAL; - goto out; - } + hist_field = parse_expr(hist_data, file, field_str, flags, + NULL, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); + goto out; } - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + hist_err("Using variable references as keys not supported: ", field_str); + destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; } - if (is_string_field(field)) - key_size = MAX_FILTER_STR_VAL; - else - key_size = field->size; + key_size = hist_field->size; } - hist_data->fields[key_idx] = create_hist_field(field, flags); - if (!hist_data->fields[key_idx]) { - ret = -ENOMEM; - goto out; - } + hist_data->fields[key_idx] = hist_field; key_size = ALIGN(key_size, sizeof(u64)); hist_data->fields[key_idx]->size = key_size; hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { ret = -EINVAL; goto out; } hist_data->n_keys++; + hist_data->n_fields++; if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) return -EINVAL; @@ -658,21 +4022,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data, return ret; } +static int create_var_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, j = hist_data->n_vals; + int ret = 0; + + unsigned int n_vars = hist_data->attrs->var_defs.n_vars; + + for (i = 0; i < n_vars; i++) { + char *var_name = hist_data->attrs->var_defs.name[i]; + char *expr = hist_data->attrs->var_defs.expr[i]; + + ret = create_var_field(hist_data, j++, file, var_name, expr); + if (ret) + goto out; + } + out: + return ret; +} + +static void free_var_defs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + kfree(hist_data->attrs->var_defs.name[i]); + kfree(hist_data->attrs->var_defs.expr[i]); + } + + hist_data->attrs->var_defs.n_vars = 0; +} + +static int parse_var_defs(struct hist_trigger_data *hist_data) +{ + char *s, *str, *var_name, *field_str; + unsigned int i, j, n_vars = 0; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_assignments; i++) { + str = hist_data->attrs->assignment_str[i]; + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) { + field_str = strsep(&str, ","); + if (!field_str) + break; + + var_name = strsep(&field_str, "="); + if (!var_name || !field_str) { + hist_err("Malformed assignment: ", var_name); + ret = -EINVAL; + goto free; + } + + if (n_vars == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", var_name); + ret = -EINVAL; + goto free; + } + + s = kstrdup(var_name, GFP_KERNEL); + if (!s) { + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.name[n_vars] = s; + + s = kstrdup(field_str, GFP_KERNEL); + if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.expr[n_vars++] = s; + + hist_data->attrs->var_defs.n_vars = n_vars; + } + } + + return ret; + free: + free_var_defs(hist_data); + + return ret; +} + static int create_hist_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { int ret; + ret = parse_var_defs(hist_data); + if (ret) + goto out; + ret = create_val_fields(hist_data, file); if (ret) goto out; - ret = create_key_fields(hist_data, file); + ret = create_var_fields(hist_data, file); if (ret) goto out; - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + ret = create_key_fields(hist_data, file); + if (ret) + goto out; out: + free_var_defs(hist_data); + return ret; } @@ -695,7 +4151,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) char *fields_str = hist_data->attrs->sort_key_str; struct tracing_map_sort_key *sort_key; int descending, ret = 0; - unsigned int i, j; + unsigned int i, j, k; hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ @@ -743,12 +4199,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) continue; } - for (j = 1; j < hist_data->n_fields; j++) { + for (j = 1, k = 1; j < hist_data->n_fields; j++) { + unsigned int idx; + hist_field = hist_data->fields[j]; + if (hist_field->flags & HIST_FIELD_FL_VAR) + continue; + + idx = k++; + test_name = hist_field_name(hist_field, 0); if (strcmp(field_name, test_name) == 0) { - sort_key->field_idx = j; + sort_key->field_idx = idx; descending = is_descending(field_str); if (descending < 0) { ret = descending; @@ -763,16 +4226,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) break; } } + hist_data->n_sort_keys = i; out: return ret; } +static void destroy_actions(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) + onmatch_destroy(data); + else if (data->fn == onmax_save) + onmax_destroy(data); + else + kfree(data); + } +} + +static int parse_actions(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct action_data *data; + unsigned int i; + int ret = 0; + char *str; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + str = hist_data->attrs->action_str[i]; + + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { + char *action_str = str + strlen("onmatch("); + + data = onmatch_parse(tr, action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = action_trace; + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { + char *action_str = str + strlen("onmax("); + + data = onmax_parse(action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = onmax_save; + } else { + ret = -EINVAL; + break; + } + + hist_data->actions[hist_data->n_actions++] = data; + } + + return ret; +} + +static int create_actions(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct action_data *data; + unsigned int i; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + data = hist_data->actions[i]; + + if (data->fn == action_trace) { + ret = onmatch_create(hist_data, file, data); + if (ret) + return ret; + } else if (data->fn == onmax_save) { + ret = onmax_create(hist_data, data); + if (ret) + return ret; + } + } + + return ret; +} + +static void print_actions(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == onmax_save) + onmax_print(m, hist_data, elt, data); + } +} + +static void print_onmax_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_puts(m, ":onmax("); + seq_printf(m, "%s", data->onmax.var_str); + seq_printf(m, ").%s(", data->onmax.fn_name); + + for (i = 0; i < hist_data->n_max_vars; i++) { + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); + if (i < hist_data->n_max_vars - 1) + seq_puts(m, ","); + } + seq_puts(m, ")"); +} + +static void print_onmatch_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, + data->onmatch.match_event); + + seq_printf(m, "%s(", data->onmatch.synth_event->name); + + for (i = 0; i < data->n_params; i++) { + if (i) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } + + seq_puts(m, ")"); +} + +static bool actions_match(struct hist_trigger_data *hist_data, + struct hist_trigger_data *hist_data_test) +{ + unsigned int i, j; + + if (hist_data->n_actions != hist_data_test->n_actions) + return false; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + struct action_data *data_test = hist_data_test->actions[i]; + + if (data->fn != data_test->fn) + return false; + + if (data->n_params != data_test->n_params) + return false; + + for (j = 0; j < data->n_params; j++) { + if (strcmp(data->params[j], data_test->params[j]) != 0) + return false; + } + + if (data->fn == action_trace) { + if (strcmp(data->onmatch.synth_event_name, + data_test->onmatch.synth_event_name) != 0) + return false; + if (strcmp(data->onmatch.match_event_system, + data_test->onmatch.match_event_system) != 0) + return false; + if (strcmp(data->onmatch.match_event, + data_test->onmatch.match_event) != 0) + return false; + } else if (data->fn == onmax_save) { + if (strcmp(data->onmax.var_str, + data_test->onmax.var_str) != 0) + return false; + if (strcmp(data->onmax.fn_name, + data_test->onmax.fn_name) != 0) + return false; + } + } + + return true; +} + + +static void print_actions_spec(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) + print_onmatch_spec(m, hist_data, data); + else if (data->fn == onmax_save) + print_onmax_spec(m, hist_data, data); + } +} + +static void destroy_field_var_hists(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + kfree(hist_data->field_var_hists[i]->cmd); + kfree(hist_data->field_var_hists[i]); + } +} + static void destroy_hist_data(struct hist_trigger_data *hist_data) { + if (!hist_data) + return; + destroy_hist_trigger_attrs(hist_data->attrs); destroy_hist_fields(hist_data); tracing_map_destroy(hist_data->map); + + destroy_actions(hist_data); + destroy_field_vars(hist_data); + destroy_field_var_hists(hist_data); + destroy_synth_var_refs(hist_data); + kfree(hist_data); } @@ -781,7 +4458,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) struct tracing_map *map = hist_data->map; struct ftrace_event_field *field; struct hist_field *hist_field; - int i, idx; + int i, idx = 0; for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; @@ -792,6 +4469,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; + else if (!field) + cmp_fn = tracing_map_cmp_num(hist_field->size, + hist_field->is_signed); else if (is_string_field(field)) cmp_fn = tracing_map_cmp_string; else @@ -800,36 +4480,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) idx = tracing_map_add_key_field(map, hist_field->offset, cmp_fn); - - } else + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR)) idx = tracing_map_add_sum_field(map); if (idx < 0) return idx; - } - - return 0; -} - -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) -{ - struct hist_field *key_field; - unsigned int i; - - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; - if (key_field->flags & HIST_FIELD_FL_EXECNAME) - return true; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + idx = tracing_map_add_var(map); + if (idx < 0) + return idx; + hist_field->var.idx = idx; + hist_field->var.hist_data = hist_data; + } } - return false; + return 0; } static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, - struct trace_event_file *file) + struct trace_event_file *file, + bool remove) { const struct tracing_map_ops *map_ops = NULL; struct hist_trigger_data *hist_data; @@ -840,6 +4513,12 @@ create_hist_data(unsigned int map_bits, return ERR_PTR(-ENOMEM); hist_data->attrs = attrs; + hist_data->remove = remove; + hist_data->event_file = file; + + ret = parse_actions(hist_data); + if (ret) + goto free; ret = create_hist_fields(hist_data, file); if (ret) @@ -849,8 +4528,7 @@ create_hist_data(unsigned int map_bits, if (ret) goto free; - if (need_tracing_map_ops(hist_data)) - map_ops = &hist_trigger_elt_comm_ops; + map_ops = &hist_trigger_elt_data_ops; hist_data->map = tracing_map_create(map_bits, hist_data->key_size, map_ops, hist_data); @@ -863,12 +4541,6 @@ create_hist_data(unsigned int map_bits, ret = create_tracing_map_fields(hist_data); if (ret) goto free; - - ret = tracing_map_init(hist_data->map); - if (ret) - goto free; - - hist_data->event_file = file; out: return hist_data; free: @@ -882,18 +4554,39 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - void *rec) + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + u64 *var_ref_vals) { + struct hist_elt_data *elt_data; struct hist_field *hist_field; - unsigned int i; + unsigned int i, var_idx; u64 hist_val; + elt_data = elt->private_data; + elt_data->var_ref_vals = var_ref_vals; + for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, rec); + hist_val = hist_field->fn(hist_field, elt, rbe, rec); + if (hist_field->flags & HIST_FIELD_FL_VAR) { + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + continue; + } tracing_map_update_sum(elt, i, hist_val); } + + for_each_hist_key_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + hist_val = hist_field->fn(hist_field, elt, rbe, rec); + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + } + } + + update_field_vars(hist_data, elt, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -920,15 +4613,31 @@ static inline void add_to_key(char *compound_key, void *key, memcpy(compound_key + key_field->offset, key, size); } -static void event_hist_trigger(struct event_trigger_data *data, void *rec) +static void +hist_trigger_actions(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, u64 *var_ref_vals) +{ + struct action_data *data; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + data = hist_data->actions[i]; + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; + struct tracing_map_elt *elt = NULL; struct stack_trace stacktrace; struct hist_field *key_field; - struct tracing_map_elt *elt; u64 field_contents; void *key = NULL; unsigned int i; @@ -949,7 +4658,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = entries; } else { - field_contents = key_field->fn(key_field, rec); + field_contents = key_field->fn(key_field, elt, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -964,9 +4673,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) if (use_compound_key) key = compound_key; + if (hist_data->n_var_refs && + !resolve_var_refs(hist_data, key, var_ref_vals, false)) + return; + elt = tracing_map_insert(hist_data->map, key); - if (elt) - hist_trigger_elt_update(hist_data, elt, rec); + if (!elt) + return; + + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + + if (resolve_var_refs(hist_data, key, var_ref_vals, true)) + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -1023,7 +4741,13 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, "%s: [%llx] %-55s", field_name, uval, str); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - char *comm = elt->private_data; + struct hist_elt_data *elt_data = elt->private_data; + char *comm; + + if (WARN_ON_ONCE(!elt_data)) + return; + + comm = elt_data->comm; uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %-16s[%10llu]", field_name, @@ -1067,6 +4791,10 @@ hist_trigger_entry_print(struct seq_file *m, for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + continue; + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, tracing_map_read_sum(elt, i)); @@ -1076,6 +4804,8 @@ hist_trigger_entry_print(struct seq_file *m, } } + print_actions(m, hist_data, elt); + seq_puts(m, "\n"); } @@ -1144,6 +4874,11 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } + if (have_hist_err()) { + seq_printf(m, "\nERROR: %s\n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_hist_cmd); + } + out_unlock: mutex_unlock(&event_mutex); @@ -1162,37 +4897,22 @@ const struct file_operations event_hist_fops = { .release = single_release, }; -static const char *get_hist_field_flags(struct hist_field *hist_field) -{ - const char *flags_str = NULL; - - if (hist_field->flags & HIST_FIELD_FL_HEX) - flags_str = "hex"; - else if (hist_field->flags & HIST_FIELD_FL_SYM) - flags_str = "sym"; - else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) - flags_str = "sym-offset"; - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) - flags_str = "execname"; - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) - flags_str = "syscall"; - else if (hist_field->flags & HIST_FIELD_FL_LOG2) - flags_str = "log2"; - - return flags_str; -} - static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); - seq_printf(m, "%s", field_name); - if (hist_field->flags) { - const char *flags_str = get_hist_field_flags(hist_field); - - if (flags_str) - seq_printf(m, ".%s", flags_str); - } + if (hist_field->var.name) + seq_printf(m, "%s=", hist_field->var.name); + + if (hist_field->flags & HIST_FIELD_FL_CPU) + seq_puts(m, "cpu"); + else if (field_name) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || + hist_field->flags & HIST_FIELD_FL_ALIAS) + seq_putc(m, '$'); + seq_printf(m, "%s", field_name); + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) + seq_puts(m, "common_timestamp"); } static int event_hist_trigger_print(struct seq_file *m, @@ -1200,7 +4920,8 @@ static int event_hist_trigger_print(struct seq_file *m, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; - struct hist_field *key_field; + struct hist_field *field; + bool have_var = false; unsigned int i; seq_puts(m, "hist:"); @@ -1211,25 +4932,47 @@ static int event_hist_trigger_print(struct seq_file *m, seq_puts(m, "keys="); for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + field = hist_data->fields[i]; if (i > hist_data->n_vals) seq_puts(m, ","); - if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + if (field->flags & HIST_FIELD_FL_STACKTRACE) seq_puts(m, "stacktrace"); else - hist_field_print(m, key_field); + hist_field_print(m, field); } seq_puts(m, ":vals="); for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + if (field->flags & HIST_FIELD_FL_VAR) { + have_var = true; + continue; + } + if (i == HITCOUNT_IDX) seq_puts(m, "hitcount"); else { seq_puts(m, ","); - hist_field_print(m, hist_data->fields[i]); + hist_field_print(m, field); + } + } + + if (have_var) { + unsigned int n = 0; + + seq_puts(m, ":"); + + for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + + if (field->flags & HIST_FIELD_FL_VAR) { + if (n++) + seq_puts(m, ","); + hist_field_print(m, field); + } } } @@ -1237,28 +4980,36 @@ static int event_hist_trigger_print(struct seq_file *m, for (i = 0; i < hist_data->n_sort_keys; i++) { struct tracing_map_sort_key *sort_key; + unsigned int idx, first_key_idx; + + /* skip VAR vals */ + first_key_idx = hist_data->n_vals - hist_data->n_vars; sort_key = &hist_data->sort_keys[i]; + idx = sort_key->field_idx; + + if (WARN_ON(idx >= HIST_FIELDS_MAX)) + return -EINVAL; if (i > 0) seq_puts(m, ","); - if (sort_key->field_idx == HITCOUNT_IDX) + if (idx == HITCOUNT_IDX) seq_puts(m, "hitcount"); else { - unsigned int idx = sort_key->field_idx; - - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) - return -EINVAL; - + if (idx >= first_key_idx) + idx += hist_data->n_vars; hist_field_print(m, hist_data->fields[idx]); } if (sort_key->descending) seq_puts(m, ".descending"); } - seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + if (hist_data->enable_timestamps) + seq_printf(m, ":clock=%s", hist_data->attrs->clock); + + print_actions_spec(m, hist_data); if (data->filter_str) seq_printf(m, " if %s", data->filter_str); @@ -1286,6 +5037,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops, return 0; } +static void unregister_field_var_hists(struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file; + unsigned int i; + char *cmd; + int ret; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + file = hist_data->field_var_hists[i]->hist_data->event_file; + cmd = hist_data->field_var_hists[i]->cmd; + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "!hist", "hist", cmd); + } +} + static void event_hist_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data) { @@ -1298,7 +5064,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { if (data->name) del_named_trigger(data); + trigger_data_free(data); + + remove_hist_vars(hist_data); + + unregister_field_var_hists(hist_data); + destroy_hist_data(hist_data); } } @@ -1425,6 +5197,15 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; if (key_field->offset != key_field_test->offset) return false; + if (key_field->size != key_field_test->size) + return false; + if (key_field->is_signed != key_field_test->is_signed) + return false; + if (!!key_field->var.name != !!key_field_test->var.name) + return false; + if (key_field->var.name && + strcmp(key_field->var.name, key_field_test->var.name) != 0) + return false; } for (i = 0; i < hist_data->n_sort_keys; i++) { @@ -1440,6 +5221,9 @@ static bool hist_trigger_match(struct event_trigger_data *data, (strcmp(data->filter_str, data_test->filter_str) != 0)) return false; + if (!actions_match(hist_data, hist_data_test)) + return false; + return true; } @@ -1456,6 +5240,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); ret = -EINVAL; goto out; } @@ -1475,13 +5260,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, test->paused = false; else if (hist_data->attrs->clear) hist_clear(test); - else + else { + hist_err("Hist trigger already exists", NULL); ret = -EEXIST; + } goto out; } } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { + hist_err("Can't clear or continue a nonexistent hist trigger", NULL); ret = -ENOENT; goto out; } @@ -1490,7 +5278,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, data->paused = true; if (named_data) { - destroy_hist_data(data->private_data); data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); data->ops = &event_hist_trigger_named_ops; @@ -1502,8 +5289,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - list_add_rcu(&data->list, &file->triggers); + if (hist_data->enable_timestamps) { + char *clock = hist_data->attrs->clock; + + ret = tracing_set_clock(file->tr, hist_data->attrs->clock); + if (ret) { + hist_err("Couldn't set trace_clock: ", clock); + goto out; + } + + tracing_set_time_stamp_abs(file->tr, true); + } + + if (named_data) + destroy_hist_data(hist_data); + ret++; + out: + return ret; +} + +static int hist_trigger_enable(struct event_trigger_data *data, + struct trace_event_file *file) +{ + int ret = 0; + + list_add_tail_rcu(&data->list, &file->triggers); update_cond_flag(file); @@ -1512,10 +5323,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, update_cond_flag(file); ret--; } - out: + return ret; } +static bool have_hist_trigger_match(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool match = false; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (hist_trigger_match(data, test, named_data, false)) { + match = true; + break; + } + } + } + + return match; +} + +static bool hist_trigger_check_refs(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + break; + } + } + + return false; +} + static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -1541,17 +5397,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (unregistered && test->ops->free) test->ops->free(test->ops, test); + + if (hist_data->enable_timestamps) { + if (!hist_data->remove || unregistered) + tracing_set_time_stamp_abs(file->tr, false); + } +} + +static bool hist_file_check_refs(struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + } + } + + return false; } static void hist_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; + struct hist_trigger_data *hist_data; + struct synth_event *se; + const char *se_name; + + if (hist_file_check_refs(file)) + return; list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + update_cond_flag(file); + if (hist_data->enable_timestamps) + tracing_set_time_stamp_abs(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } @@ -1567,16 +5461,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct hist_trigger_attrs *attrs; struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; - char *trigger; + struct synth_event *se; + const char *se_name; + bool remove = false; + char *trigger, *p; int ret = 0; + if (glob && strlen(glob)) { + last_cmd_set(param); + hist_err_clear(); + } + if (!param) return -EINVAL; - /* separate the trigger from the filter (k:v [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) - return -EINVAL; + if (glob[0] == '!') + remove = true; + + /* + * separate the trigger from the filter (k:v [if filter]) + * allowing for whitespace in the trigger + */ + p = trigger = param; + do { + p = strstr(p, "if"); + if (!p) + break; + if (p == param) + return -EINVAL; + if (*(p - 1) != ' ' && *(p - 1) != '\t') { + p++; + continue; + } + if (p >= param + strlen(param) - strlen("if") - 1) + return -EINVAL; + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { + p++; + continue; + } + break; + } while (p); + + if (!p) + param = NULL; + else { + *(p - 1) = '\0'; + param = strstrip(p); + trigger = strstrip(trigger); + } attrs = parse_hist_trigger_attrs(trigger); if (IS_ERR(attrs)) @@ -1585,7 +5517,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (attrs->map_bits) hist_trigger_bits = attrs->map_bits; - hist_data = create_hist_data(hist_trigger_bits, attrs, file); + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove); if (IS_ERR(hist_data)) { destroy_hist_trigger_attrs(attrs); return PTR_ERR(hist_data); @@ -1593,10 +5525,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) + if (!trigger_data) { + ret = -ENOMEM; goto out_free; + } trigger_data->count = -1; trigger_data->ops = trigger_ops; @@ -1614,8 +5547,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - if (glob[0] == '!') { + if (remove) { + if (!have_hist_trigger_match(trigger_data, file)) + goto out_free; + + if (hist_trigger_check_refs(trigger_data, file)) { + ret = -EBUSY; + goto out_free; + } + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + ret = 0; goto out_free; } @@ -1632,14 +5581,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } else if (ret < 0) goto out_free; + + if (get_named_trigger_data(trigger_data)) + goto enable; + + if (has_hist_vars(hist_data)) + save_hist_vars(hist_data); + + ret = create_actions(hist_data, file); + if (ret) + goto out_unreg; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto out_unreg; +enable: + ret = hist_trigger_enable(trigger_data, file); + if (ret) + goto out_unreg; + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref++; + mutex_unlock(&synth_event_mutex); + /* Just return zero, not the number of registered triggers */ ret = 0; out: + if (ret == 0) + hist_err_clear(); + return ret; + out_unreg: + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); + remove_hist_vars(hist_data); + kfree(trigger_data); destroy_hist_data(hist_data); @@ -1669,7 +5651,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec) +hist_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; @@ -1685,7 +5668,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +hist_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1693,7 +5677,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec); + hist_enable_trigger(data, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { @@ -1798,3 +5782,31 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } + +static __init int trace_events_hist_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 87411482a46f..d251cabcf69a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec) +event_triggers_call(struct trace_event_file *file, void *rec, + struct ring_buffer_event *event) { struct event_trigger_data *data; enum event_trigger_type tt = ETT_NONE; @@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) if (data->paused) continue; if (!rec) { - data->ops->func(data, rec); + data->ops->func(data, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec); + data->ops->func(data, rec, event); } return tt; } @@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt, - void *rec) + void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, rec); + data->ops->func(data, rec, event); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data, data->named_data = named_data; } +struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data) +{ + return data->named_data; +} + static void -traceon_trigger(struct event_trigger_data *data, void *rec) +traceon_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec) } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec) +traceon_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -933,7 +942,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec) } static void -traceoff_trigger(struct event_trigger_data *data, void *rec) +traceoff_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec) } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec) +traceoff_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -1039,13 +1050,15 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec) +snapshot_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec) +snapshot_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1053,7 +1066,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec); + snapshot_trigger(data, rec, event); } static int @@ -1141,13 +1154,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec) +stacktrace_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1155,7 +1170,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec); + stacktrace_trigger(data, rec, event); } static int @@ -1217,7 +1232,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec) +event_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1228,7 +1244,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec) } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec) +event_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1242,7 +1259,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec); + event_enable_trigger(data, rec, event); } int event_enable_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ad1d6164e946..50f44b7b2b32 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = { }; int __trace_bprintk(unsigned long ip, const char *fmt, ...) - { +{ int ret; va_list ap; @@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) EXPORT_SYMBOL_GPL(__trace_bprintk); int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) - { +{ if (unlikely(!fmt)) return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2014f4351ae0..34fd0e0ec51d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, return; ret = strncpy_from_user(dst, src, maxlen); + if (ret == maxlen) + dst[--ret] = '\0'; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; @@ -446,7 +448,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(d_inode(path.dentry)); + inode = igrab(d_real_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { @@ -602,24 +604,9 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:", tu->filename); - - /* Don't print "0x (null)" when offset is 0 */ - if (tu->offset) { - seq_printf(m, "0x%px", (void *)tu->offset); - } else { - switch (sizeof(void *)) { - case 4: - seq_printf(m, "0x00000000"); - break; - case 8: - default: - seq_printf(m, "0x0000000000000000"); - break; - } - } + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, + trace_event_name(&tu->tp.call), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 07e75344725b..5cadb1b8b5fe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) return (u64)atomic64_read(&elt->fields[i].sum); } +/** + * tracing_map_set_var - Assign a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * @n: The value to assign + * + * Assign n to variable i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. + */ +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_set(&elt->vars[i], n); + elt->var_set[i] = true; +} + +/** + * tracing_map_var_set - Return whether or not a variable has been set + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Return true if the variable has been set, false otherwise. The + * index i is the index returned by the call to tracing_map_add_var() + * when the tracing map was set up. + */ +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i) +{ + return elt->var_set[i]; +} + +/** + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_var() when the tracing map was set + * up. + * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->vars[i]); +} + +/** + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance, and reset the variable to the 'not set' + * state. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. The reset + * essentially makes the variable a read-once variable if it's only + * accessed using this function. + * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i) +{ + elt->var_set[i] = false; + return (u64)atomic64_read(&elt->vars[i]); +} + int tracing_map_cmp_string(void *val_a, void *val_b) { char *a = val_a; @@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map) } /** + * tracing_map_add_var - Add a field describing a tracing_map var + * @map: The tracing_map + * + * Add a var to the map and return the index identifying it in the map + * and associated tracing_map_elts. This is the index used for + * instance to update a var for a particular tracing_map_elt using + * tracing_map_update_var() or reading it via tracing_map_read_var(). + * + * Return: The index identifying the var in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_var(struct tracing_map *map) +{ + int ret = -EINVAL; + + if (map->n_vars < TRACING_MAP_VARS_MAX) + ret = map->n_vars++; + + return ret; +} + +/** * tracing_map_add_key_field - Add a field describing a tracing_map key * @map: The tracing_map * @offset: The offset within the key @@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt) if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) atomic64_set(&elt->fields[i].sum, 0); + for (i = 0; i < elt->map->n_vars; i++) { + atomic64_set(&elt->vars[i], 0); + elt->var_set[i] = false; + } + if (elt->map->ops && elt->map->ops->elt_clear) elt->map->ops->elt_clear(elt); } @@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) if (elt->map->ops && elt->map->ops->elt_free) elt->map->ops->elt_free(elt); kfree(elt->fields); + kfree(elt->vars); + kfree(elt->var_set); kfree(elt->key); kfree(elt); } @@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) goto free; } + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); + if (!elt->vars) { + err = -ENOMEM; + goto free; + } + + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); + if (!elt->var_set) { + err = -ENOMEM; + goto free; + } + tracing_map_elt_init_fields(elt); if (map->ops && map->ops->elt_alloc) { @@ -414,7 +522,9 @@ static inline struct tracing_map_elt * __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) { u32 idx, key_hash, test_key; + int dup_try = 0; struct tracing_map_entry *entry; + struct tracing_map_elt *val; key_hash = jhash(key, map->key_size, 0); if (key_hash == 0) @@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) entry = TRACING_MAP_ENTRY(map->map, idx); test_key = entry->key; - if (test_key && test_key == key_hash && entry->val && - keys_match(key, entry->val->key, map->key_size)) { - if (!lookup_only) - atomic64_inc(&map->hits); - return entry->val; + if (test_key && test_key == key_hash) { + val = READ_ONCE(entry->val); + if (val && + keys_match(key, val->key, map->key_size)) { + if (!lookup_only) + atomic64_inc(&map->hits); + return val; + } else if (unlikely(!val)) { + /* + * The key is present. But, val (pointer to elt + * struct) is still NULL. which means some other + * thread is in the process of inserting an + * element. + * + * On top of that, it's key_hash is same as the + * one being inserted right now. So, it's + * possible that the element has the same + * key as well. + */ + + dup_try++; + if (dup_try > map->map_size) { + atomic64_inc(&map->drops); + break; + } + continue; + } } if (!test_key) { @@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) atomic64_inc(&map->hits); return entry->val; + } else { + /* + * cmpxchg() failed. Loop around once + * more to check what key was inserted. + */ + dup_try++; + continue; } } @@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) return sort_entry; } -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) -{ - struct tracing_map_elt *dup_elt; - unsigned int i; - - dup_elt = tracing_map_elt_alloc(elt->map); - if (IS_ERR(dup_elt)) - return NULL; - - if (elt->map->ops && elt->map->ops->elt_copy) - elt->map->ops->elt_copy(dup_elt, elt); - - dup_elt->private_data = elt->private_data; - memcpy(dup_elt->key, elt->key, elt->map->key_size); - - for (i = 0; i < elt->map->n_fields; i++) { - atomic64_set(&dup_elt->fields[i].sum, - atomic64_read(&elt->fields[i].sum)); - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; - } - - return dup_elt; -} - -static int merge_dup(struct tracing_map_sort_entry **sort_entries, - unsigned int target, unsigned int dup) -{ - struct tracing_map_elt *target_elt, *elt; - bool first_dup = (target - dup) == 1; - int i; - - if (first_dup) { - elt = sort_entries[target]->elt; - target_elt = copy_elt(elt); - if (!target_elt) - return -ENOMEM; - sort_entries[target]->elt = target_elt; - sort_entries[target]->elt_copied = true; - } else - target_elt = sort_entries[target]->elt; - - elt = sort_entries[dup]->elt; - - for (i = 0; i < elt->map->n_fields; i++) - atomic64_add(atomic64_read(&elt->fields[i].sum), - &target_elt->fields[i].sum); - - sort_entries[dup]->dup = true; - - return 0; -} - -static int merge_dups(struct tracing_map_sort_entry **sort_entries, +static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { unsigned int dups = 0, total_dups = 0; - int err, i, j; + int i; void *key; if (n_entries < 2) - return total_dups; + return; sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), (int (*)(const void *, const void *))cmp_entries_dup, NULL); @@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries, for (i = 1; i < n_entries; i++) { if (!memcmp(sort_entries[i]->key, key, key_size)) { dups++; total_dups++; - err = merge_dup(sort_entries, i - dups, i); - if (err) - return err; continue; } key = sort_entries[i]->key; dups = 0; } - if (!total_dups) - return total_dups; - - for (i = 0, j = 0; i < n_entries; i++) { - if (!sort_entries[i]->dup) { - sort_entries[j] = sort_entries[i]; - if (j++ != i) - sort_entries[i] = NULL; - } else { - destroy_sort_entry(sort_entries[i]); - sort_entries[i] = NULL; - } - } - - return total_dups; + WARN_ONCE(total_dups > 0, + "Duplicates detected: %d\n", total_dups); } static bool is_key(struct tracing_map *map, unsigned int field_idx) @@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map, return 1; } - ret = merge_dups(entries, n_entries, map->key_size); - if (ret < 0) - goto free; - n_entries -= ret; + detect_dups(entries, n_entries, map->key_size); if (is_key(map, sort_keys[0].field_idx)) cmp_entries_fn = cmp_entries_key; diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 5b5bbf8ae550..053eb92b2d31 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -10,6 +10,7 @@ #define TRACING_MAP_VALS_MAX 3 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ TRACING_MAP_VALS_MAX) +#define TRACING_MAP_VARS_MAX 16 #define TRACING_MAP_SORT_KEYS_MAX 2 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); @@ -137,6 +138,8 @@ struct tracing_map_field { struct tracing_map_elt { struct tracing_map *map; struct tracing_map_field *fields; + atomic64_t *vars; + bool *var_set; void *key; void *private_data; }; @@ -192,6 +195,7 @@ struct tracing_map { int key_idx[TRACING_MAP_KEYS_MAX]; unsigned int n_keys; struct tracing_map_sort_key sort_key; + unsigned int n_vars; atomic64_t hits; atomic64_t drops; }; @@ -215,11 +219,6 @@ struct tracing_map { * Element allocation occurs before tracing begins, when the * tracing_map_init() call is made by client code. * - * @elt_copy: At certain points in the lifetime of an element, it may - * need to be copied. The copy should include a copy of the - * client-allocated data, which can be copied into the 'to' - * element from the 'from' element. - * * @elt_free: When a tracing_map_elt is freed, this function is called * and allows client-allocated per-element data to be freed. * @@ -233,8 +232,6 @@ struct tracing_map { */ struct tracing_map_ops { int (*elt_alloc)(struct tracing_map_elt *elt); - void (*elt_copy)(struct tracing_map_elt *to, - struct tracing_map_elt *from); void (*elt_free)(struct tracing_map_elt *elt); void (*elt_clear)(struct tracing_map_elt *elt); void (*elt_init)(struct tracing_map_elt *elt); @@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits, extern int tracing_map_init(struct tracing_map *map); extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_var(struct tracing_map *map); extern int tracing_map_add_key_field(struct tracing_map *map, unsigned int offset, tracing_map_cmp_fn_t cmp_fn); @@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b); extern void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n); +extern void tracing_map_set_var(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); + extern void tracing_map_set_field_descr(struct tracing_map *map, unsigned int i, unsigned int key_offset, diff --git a/kernel/ucount.c b/kernel/ucount.c index b4eeee03934f..f48d1b6376a4 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/hash.h> +#include <linux/kmemleak.h> #include <linux/user_namespace.h> #define UCOUNTS_HASHTABLE_BITS 10 diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -19,6 +19,8 @@ #include <linux/proc_ns.h> #include <linux/sched/task.h> +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} |