diff options
Diffstat (limited to 'kernel')
77 files changed, 2476 insertions, 1493 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 8d528f9930da..a8a91bd2b2a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -932,7 +932,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(msg_type); + err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { @@ -1379,7 +1379,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(audit_filter_type(type))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { diff --git a/kernel/audit.h b/kernel/audit.h index a492f4c4e710..431444c3708b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -331,6 +331,8 @@ extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; +extern int audit_filter(int msgtype, unsigned int listtype); + #ifdef CONFIG_AUDITSYSCALL extern int __audit_signal_info(int sig, struct task_struct *t); static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d6709eb70970..0d302a87f21b 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/file.h> #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 94ca7b1e5e7e..85d9cac497e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, int type, - enum audit_state *state) +int audit_filter(int msgtype, unsigned int listtype) { - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - pid_t pid; - int result = 0; - u32 sid; - - switch (f->type) { - case AUDIT_PID: - pid = task_pid_nr(current); - result = audit_comparator(pid, f->op, f->val); - break; - case AUDIT_UID: - result = audit_uid_comparator(current_uid(), f->op, f->uid); - break; - case AUDIT_GID: - result = audit_gid_comparator(current_gid(), f->op, f->gid); - break; - case AUDIT_LOGINUID: - result = audit_uid_comparator(audit_get_loginuid(current), - f->op, f->uid); - break; - case AUDIT_LOGINUID_SET: - result = audit_comparator(audit_loginuid_set(current), - f->op, f->val); - break; - case AUDIT_MSGTYPE: - result = audit_comparator(type, f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - if (f->lsm_rule) { - security_task_getsecid(current, &sid); - result = security_audit_rule_match(sid, - f->type, - f->op, - f->lsm_rule, - NULL); - } - break; - } - - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -int audit_filter_user(int type) -{ - enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; - int rc, ret; - - ret = 1; /* Audit by default */ - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - rc = audit_filter_user_rules(&e->rule, type, &state); - if (rc) { - if (rc > 0 && state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -int audit_filter_type(int type) -{ - struct audit_entry *e; - int result = 0; + int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + if (list_empty(&audit_filter_list[listtype])) goto unlock_and_return; + list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { + int i, result = 0; - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - int i; for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_MSGTYPE) { - result = audit_comparator(type, f->op, f->val); - if (!result) - break; + pid_t pid; + u32 sid; + + switch (f->type) { + case AUDIT_PID: + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); + break; + case AUDIT_UID: + result = audit_uid_comparator(current_uid(), f->op, f->uid); + break; + case AUDIT_GID: + result = audit_gid_comparator(current_gid(), f->op, f->gid); + break; + case AUDIT_LOGINUID: + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); + break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; + case AUDIT_MSGTYPE: + result = audit_comparator(msgtype, f->op, f->val); + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, + f->type, f->op, f->lsm_rule, NULL); + } + break; + default: + goto unlock_and_return; } + if (result < 0) /* error */ + goto unlock_and_return; + if (!result) + break; + } + if (result > 0) { + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + ret = 0; + break; } - if (result) - goto unlock_and_return; } unlock_and_return: rcu_read_unlock(); - return result; + return ret; } static int update_lsm_rule(struct audit_krule *r) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2672d105cffc..5abf1dc1f91c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -72,6 +72,7 @@ #include <linux/compat.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <uapi/linux/limits.h> #include "audit.h" @@ -81,7 +82,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -694,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk, ctx->prio = rule->prio; } switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + case AUDIT_NEVER: + *state = AUDIT_DISABLED; + break; + case AUDIT_ALWAYS: + *state = AUDIT_RECORD_CONTEXT; + break; } return 1; } @@ -987,184 +993,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) +static void audit_log_execve_info(struct audit_context *context, + struct audit_buffer **ab) { - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; + char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; + + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; + + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; + + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { + audit_panic("out of memory for argv string"); + return; } + buf = buf_head; - /* walk the whole argument looking for non-ascii chars */ + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? */ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab) -{ - int i, len; - size_t len_sent = 0; - const char __user *p; - char *buf; + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } - p = (const char __user *)current->mm->arg_start; + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string"); - return; - } + /* NOTE: the caller handles the final audit_log_end() call */ - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) @@ -1425,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fff3650d52fc..570eeca7bdfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -26,11 +26,18 @@ struct bpf_htab { struct bucket *buckets; void *elems; struct pcpu_freelist freelist; + void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; +enum extra_elem_state { + HTAB_NOT_AN_EXTRA_ELEM = 0, + HTAB_EXTRA_ELEM_FREE, + HTAB_EXTRA_ELEM_USED +}; + /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -38,7 +45,10 @@ struct htab_elem { struct bpf_htab *htab; struct pcpu_freelist_node fnode; }; - struct rcu_head rcu; + union { + struct rcu_head rcu; + enum extra_elem_state state; + }; u32 hash; char key[0] __aligned(8); }; @@ -113,6 +123,23 @@ free_elems: return err; } +static int alloc_extra_elems(struct bpf_htab *htab) +{ + void __percpu *pptr; + int cpu; + + pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = + HTAB_EXTRA_ELEM_FREE; + } + htab->extra_elems = pptr; + return 0; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) cost += (u64) round_up(htab->map.value_size, 8) * num_possible_cpus() * htab->map.max_entries; + else + cost += (u64) htab->elem_size * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ @@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } + if (!percpu) { + err = alloc_extra_elems(htab); + if (err) + goto free_buckets; + } + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { err = prealloc_elems_and_freelist(htab); if (err) - goto free_buckets; + goto free_extra_elems; } return &htab->map; +free_extra_elems: + free_percpu(htab->extra_elems); free_buckets: kvfree(htab->buckets); free_htab: @@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); - } static void htab_elem_free_rcu(struct rcu_head *head) @@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + if (l->state == HTAB_EXTRA_ELEM_USED) { + l->state = HTAB_EXTRA_ELEM_FREE; + return; + } + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu, bool onallcpus) + bool percpu, bool onallcpus, + bool old_elem_exists) { u32 size = htab->map.value_size; bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; + int err = 0; if (prealloc) { l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); if (!l_new) - return ERR_PTR(-E2BIG); + err = -E2BIG; } else { if (atomic_inc_return(&htab->count) > htab->map.max_entries) { atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + err = -E2BIG; + } else { + l_new = kmalloc(htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); + } + + if (err) { + if (!old_elem_exists) + return ERR_PTR(err); + + /* if we're updating the existing element and the hash table + * is full, use per-cpu extra elems + */ + l_new = this_cpu_ptr(htab->extra_elems); + if (l_new->state != HTAB_EXTRA_ELEM_FREE) + return ERR_PTR(-E2BIG); + l_new->state = HTAB_EXTRA_ELEM_USED; + } else { + l_new->state = HTAB_NOT_AN_EXTRA_ELEM; } memcpy(l_new->key, key, key_size); @@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, + !!l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus); + hash, true, onallcpus, false); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map) htab_free_elems(htab); pcpu_freelist_destroy(&htab->freelist); } + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f72f23b8fdab..daea765d72e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; }; @@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_in_cgroup) + if (func_id != BPF_FUNC_skb_under_cgroup) goto error; break; default: @@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; - case BPF_FUNC_skb_in_cgroup: + case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; @@ -1301,7 +1302,7 @@ add_imm: /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ - dst_reg->id++; + dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range and off to zero */ dst_reg->off = 0; diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ + int capable; + + if (unlikely(!cap_valid(cap))) { + pr_crit("capable() called with invalid cap=%u\n", cap); + BUG(); + } + + capable = audit ? security_capable(current_cred(), ns, cap) : + security_capable_noaudit(current_cred(), ns, cap); + if (capable == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} + /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap) */ bool ns_capable(struct user_namespace *ns, int cap) { - if (unlikely(!cap_valid(cap))) { - pr_crit("capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; + return ns_capable_common(ns, cap, true); } EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit); /** * capable - Determine if the current task has a superior capability in effect diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9624db80dc4e..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - /* - * We know this subsystem has not yet been bound. Users in a non-init - * user namespace may only mount hierarchies with no bound subsystems, - * i.e. 'none,name=user1' - */ - if (!opts.none && !capable(CAP_SYS_ADMIN)) { + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { ret = -EPERM; goto out_unlock; } @@ -2956,6 +2952,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -2970,6 +2967,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return retval; @@ -4337,6 +4335,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) @@ -4365,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; } @@ -6269,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { @@ -6339,14 +6346,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - mutex_lock(&cgroup_mutex); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); - cset = task_css_set(current); get_css_set(cset); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config new file mode 100644 index 000000000000..9f748ed7bea8 --- /dev/null +++ b/kernel/configs/android-base.config @@ -0,0 +1,152 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_DEVKMEM is not set +# CONFIG_DEVMEM is not set +# CONFIG_INET_LRO is not set +# CONFIG_MODULES is not set +# CONFIG_OABI_COMPAT is not set +# CONFIG_SYSVIPC is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_ASHMEM=y +CONFIG_AUDIT=y +CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_SCHED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DM_CRYPT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_EMBEDDED=y +CONFIG_FB=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_INET_ESP=y +CONFIG_INET_XFRM_MODE_TUNNEL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IPV6=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_TPROXY=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_KEY=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_NAT=y +CONFIG_NO_HZ=y +CONFIG_PACKET=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PREEMPT=y +CONFIG_QUOTA=y +CONFIG_RTC_CLASS=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SETEND_EMULATION=y +CONFIG_STAGING=y +CONFIG_SWP_EMULATION=y +CONFIG_SYNC=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config new file mode 100644 index 000000000000..e3b953e966d2 --- /dev/null +++ b/kernel/configs/android-recommended.config @@ -0,0 +1,121 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_NF_CONNTRACK_SIP is not set +# CONFIG_PM_WAKELOCKS_GC is not set +# CONFIG_VT is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_COMPACTION=y +CONFIG_DEBUG_RODATA=y +CONFIG_DM_UEVENT=y +CONFIG_DRAGONRISE_FF=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FUSE_FS=y +CONFIG_GREENASIA_FF=y +CONFIG_HIDRAW=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_EZKEY=y +CONFIG_HID_GREENASIA=y +CONFIG_HID_GYRATION=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WACOM=y +CONFIG_HID_WALTOP=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_GPIO=y +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_UINPUT=y +CONFIG_ION=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KSM=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGITECH_FF=y +CONFIG_MD=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_MSDOS_FS=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_PANTHERLORD_FF=y +CONFIG_PERF_EVENTS=y +CONFIG_PM_DEBUG=y +CONFIG_PM_RUNTIME=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +CONFIG_POWER_SUPPLY=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_SCHEDSTATS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_SND=y +CONFIG_SOUND=y +CONFIG_SUSPEND_TIME=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_TASK_XACCT=y +CONFIG_TIMER_STATS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_UHID=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_USBNET=y +CONFIG_VFAT_FS=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 7b61887f7ccd..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, if (!cpu_online(cpu)) return 0; + /* + * If we are up and running, use the hotplug thread. For early calls + * we invoke the thread function directly. + */ + if (!st->thread) + return cpuhp_invoke_callback(cpu, state, cb); + st->cb_state = state; st->cb = cb; /* @@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_PERF_PREPARE] = { + .name = "perf prepare", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, + [CPUHP_WORKQUEUE_PREP] = { + .name = "workqueue prepare", + .startup = workqueue_prepare_cpu, + .teardown = NULL, + }, + [CPUHP_HRTIMERS_PREPARE] = { + .name = "hrtimers prepare", + .startup = hrtimers_prepare_cpu, + .teardown = hrtimers_dead_cpu, + }, + [CPUHP_SMPCFD_PREPARE] = { + .name = "SMPCFD prepare", + .startup = smpcfd_prepare_cpu, + .teardown = smpcfd_dead_cpu, + }, + [CPUHP_RCUTREE_PREP] = { + .name = "RCU-tree prepare", + .startup = rcutree_prepare_cpu, + .teardown = rcutree_dead_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. @@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* + * On the tear-down path, timers_dead_cpu() must be invoked + * before blk_mq_queue_reinit_notify() from notify_dead(), + * otherwise a RCU stall occurs. + */ + [CPUHP_TIMERS_DEAD] = { + .name = "timers dead", + .startup = NULL, + .teardown = timers_dead_cpu, + }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", @@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_AP_SMPCFD_DYING] = { + .startup = NULL, + .teardown = smpcfd_dying_cpu, + }, /* * Handled on controll processor until the plugged processor manages * this itself. @@ -1227,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = sched_cpu_starting, .teardown = sched_cpu_dying, }, + [CPUHP_AP_RCUTREE_DYING] = { + .startup = NULL, + .teardown = rcutree_dying_cpu, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1250,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = smpboot_unpark_threads, .teardown = NULL, }, + [CPUHP_AP_PERF_ONLINE] = { + .name = "perf online", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, + [CPUHP_AP_WORKQUEUE_ONLINE] = { + .name = "workqueue online", + .startup = workqueue_online_cpu, + .teardown = workqueue_offline_cpu, + }, + [CPUHP_AP_RCUTREE_ONLINE] = { + .name = "RCU-tree online", + .startup = rcutree_online_cpu, + .teardown = rcutree_offline_cpu, + }, + /* * Online/down_prepare notifiers. Will be removed once the notifiers * are converted to states. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c27e53326bef 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing @@ -2078,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2088,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = true, }; diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); diff --git a/kernel/events/core.c b/kernel/events/core.c index 09ae27b353c1..fc9bb2225291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -242,18 +242,6 @@ unlock: return ret; } -static void event_function_local(struct perf_event *event, event_f func, void *data) -{ - struct event_function_struct efs = { - .event = event, - .func = func, - .data = data, - }; - - int ret = event_function(&efs); - WARN_ON_ONCE(ret); -} - static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; @@ -303,6 +291,54 @@ again: raw_spin_unlock_irq(&ctx->lock); } +/* + * Similar to event_function_call() + event_function(), but hard assumes IRQs + * are already disabled and we're on the right CPU. + */ +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct task_struct *task = READ_ONCE(ctx->task); + struct perf_event_context *task_ctx = NULL; + + WARN_ON_ONCE(!irqs_disabled()); + + if (task) { + if (task == TASK_TOMBSTONE) + return; + + task_ctx = ctx; + } + + perf_ctx_lock(cpuctx, task_ctx); + + task = ctx->task; + if (task == TASK_TOMBSTONE) + goto unlock; + + if (task) { + /* + * We must be either inactive or active and the right task, + * otherwise we're screwed, since we cannot IPI to somewhere + * else. + */ + if (ctx->is_active) { + if (WARN_ON_ONCE(task != current)) + goto unlock; + + if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) + goto unlock; + } + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + func(event, cpuctx, ctx, data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -448,7 +484,7 @@ static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { - printk_ratelimited(KERN_WARNING + printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, @@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) @@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event) static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -2236,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). + */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -2437,11 +2496,11 @@ static int __perf_event_stop(void *info) return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -3490,8 +3549,17 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - smp_call_function_single(event->oncpu, - __perf_event_read, &data, 1); + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; @@ -3861,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) @@ -4777,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. + */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -5969,6 +6050,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) @@ -6044,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -6088,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6098,7 +6193,7 @@ static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct pmu *pmu = event->pmu; - struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -6553,15 +6648,6 @@ got_name: } /* - * Whether this @filter depends on a dynamic object which is not loaded - * yet or its load addresses are not known. - */ -static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) -{ - return filter->filter && filter->inode; -} - -/* * Check whether inode and address range match filter criteria. */ static bool perf_addr_filter_match(struct perf_addr_filter *filter, @@ -6611,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -6622,6 +6708,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) struct perf_event_context *ctx; int ctxn; + /* + * Data tracing isn't supported yet and as such there is no need + * to keep track of anything that isn't related to executable code: + */ + if (!(vma->vm_flags & VM_EXEC)) + return; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); @@ -7774,7 +7867,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) list_for_each_entry(filter, &ifh->list, entry) { event->addr_filters_offs[count] = 0; - if (perf_addr_filter_needs_mmap(filter)) + /* + * Adjust base offset if the filter is associated to a binary + * that needs to be mapped: + */ + if (filter->inode) event->addr_filters_offs[count] = perf_addr_filter_apply(filter, mm); @@ -7789,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7905,8 +8002,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, goto fail; } - if (token == IF_SRC_FILE) { - filename = match_strdup(&args[2]); + if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { + int fpos = filter->range ? 2 : 1; + + filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; @@ -10357,7 +10456,7 @@ static void __init perf_event_init_all_cpus(void) } } -static void perf_event_init_cpu(int cpu) +int perf_event_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10370,6 +10469,7 @@ static void perf_event_init_cpu(int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); + return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10401,14 +10501,17 @@ static void perf_event_exit_cpu_context(int cpu) } srcu_read_unlock(&pmus_srcu, idx); } +#else + +static void perf_event_exit_cpu_context(int cpu) { } + +#endif -static void perf_event_exit_cpu(int cpu) +int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); + return 0; } -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -10430,46 +10533,6 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - /* - * This must be done before the CPU comes alive, because the - * moment we can run tasks we can encounter (software) events. - * - * Specifically, someone can have inherited events on kthreadd - * or a pre-existing worker thread that gets re-bound. - */ - perf_event_init_cpu(cpu); - break; - - case CPU_DOWN_PREPARE: - /* - * This must be done before the CPU dies because after that an - * active event might want to IPI the CPU and that'll not work - * so great for dead CPUs. - * - * XXX smp_call_function_single() return -ENXIO without a warn - * so we could possibly deal with this. - * - * This is safe against new events arriving because - * sys_perf_event_open() serializes against hotplug using - * get_online_cpus(). - */ - perf_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - void __init perf_event_init(void) { int ret; @@ -10482,7 +10545,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_cpu_clock, NULL, -1); perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); + perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b7a525ab2083..8c50276b60d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg, false); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); @@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/exit.c b/kernel/exit.c index 84ae830234f8..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -715,7 +715,7 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } @@ -848,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); diff --git a/kernel/fork.c b/kernel/fork.c index de21f25e0d2c..beb31725f7e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - if (page) - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - 1 << THREAD_SIZE_ORDER); - return page ? page_address(page) : NULL; } static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(stack); - - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - -(1 << THREAD_SIZE_ORDER)); - __free_pages(page, THREAD_SIZE_ORDER); + __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(stack)); + /* All stack pages are in the same zone and belong to the same memcg. */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - mod_zone_page_state(zone, NR_KERNEL_STACK, account); + memcg_kmem_update_page_stat( + first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } void free_task(struct task_struct *tsk) @@ -801,6 +799,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) EXPORT_SYMBOL(get_mm_exe_file); /** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). + */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + +/** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning @@ -915,14 +936,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has @@ -1406,7 +1425,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1558,6 +1576,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1658,6 +1677,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1690,7 +1710,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_thread_flag(TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/futex.c b/kernel/futex.c index 33664f70e2d2..46cb3a301bc1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled; * Futex flags used to encode options to functions and preserve them across * restarts. */ -#define FLAGS_SHARED 0x01 +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. + */ +# define FLAGS_SHARED 0x00 +#endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key) if (!key->both.ptr) return; + /* + * On MMU less systems futexes are always "private" as there is no per + * process address space. We need the smp wmb nevertheless - yes, + * arch/blackfin has MMU less SMP ... + */ + if (!IS_ENABLED(CONFIG_MMU)) { + smp_mb(); /* explicit smp_mb(); (B) */ + return; + } + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: ihold(key->shared.inode); /* implies smp_mb(); (B) */ @@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key) return; } + if (!IS_ENABLED(CONFIG_MMU)) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: iput(key->shared.inode); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f68959341c0f..32f6cfcff212 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) return NULL; } + get_online_cpus(); if (max_vecs >= num_online_cpus()) { cpumask_copy(affinity_mask, cpu_online_mask); *nr_vecs = num_online_cpus(); @@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) } *nr_vecs = vecs; } + put_online_cpus(); return affinity_mask; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..26ba5654d9d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + + /* + * We're about to start this interrupt immediately, + * hence the need to set the trigger configuration. + * But the .set_type callback may have overridden the + * flow handler, ignoring that we're dealing with a + * chained interrupt. Reset it immediately because we + * do know better. + */ + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } + irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..9530fcd27704 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); @@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 54999350162c..19e9dfbe97fa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..93ad6c1fb9b6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> +#include <linux/bug.h> #ifdef HAVE_JUMP_LABEL @@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); +/* + * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * The use of 'atomic_read()' requires atomic.h and its problematic for some + * kernel headers such as kernel.h and others. Since static_key_count() is not + * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * to have it be a function here. Similarly, for 'static_key_enable()' and + * 'static_key_disable()', which require bug.h. This should allow jump_label.h + * to be included from most/all places for HAVE_JUMP_LABEL. + */ +int static_key_count(struct static_key *key) +{ + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + + return n >= 0 ? n : 1; +} +EXPORT_SYMBOL_GPL(static_key_count); + +void static_key_enable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (!count) + static_key_slow_inc(key); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (count) + static_key_slow_dec(key); +} +EXPORT_SYMBOL_GPL(static_key_disable); + void static_key_slow_inc(struct static_key *key) { int v, v1; @@ -235,6 +279,18 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + /* + * Since we are initializing the static_key.enabled field with + * with the 'raw' int values (to avoid pulling in atomic.h) in + * jump_label.h, let's make sure that is safe. There are only two + * cases to check since we initialize to 0 or 1. + */ + BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); + BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); + + if (static_key_initialized) + return; + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -284,11 +340,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + preempt_disable(); mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + preempt_enable(); + if (!mod) return 0; - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); return __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, diff --git a/kernel/kexec.c b/kernel/kexec.c index 4384672d3245..980936a90ee6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (kexec_on_panic) { /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) + if ((entry < phys_to_boot_phys(crashk_res.start)) || + (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0927b0..561675589511 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p) return 0; } +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -140,6 +146,7 @@ int kexec_should_crash(struct task_struct *p) * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) +#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, @@ -147,8 +154,9 @@ static struct page *kimage_alloc_page(struct kimage *image, int sanity_check_segment_list(struct kimage *image) { - int result, i; + int i; unsigned long nr_segments = image->nr_segments; + unsigned long total_pages = 0; /* * Verify we have good destination addresses. The caller is @@ -163,16 +171,17 @@ int sanity_check_segment_list(struct kimage *image) * simply because addresses are changed to page size * granularity. */ - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; + if (mstart > mend) + return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; + return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; + return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. @@ -180,7 +189,6 @@ int sanity_check_segment_list(struct kimage *image) * through very weird things can happen with no * easy explanation as one segment stops on another. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; @@ -194,7 +202,7 @@ int sanity_check_segment_list(struct kimage *image) pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - return result; + return -EINVAL; } } @@ -203,12 +211,26 @@ int sanity_check_segment_list(struct kimage *image) * and it is easier to check up front than to be surprised * later on. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - return result; + return -EINVAL; + } + + /* + * Verify that no more than half of memory will be consumed. If the + * request from userspace is too large, a large amount of time will be + * wasted allocating pages, which can cause a soft lockup. + */ + for (i = 0; i < nr_segments; i++) { + if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + return -EINVAL; + + total_pages += PAGE_COUNT(image->segment[i].memsz); } + if (total_pages > totalram_pages / 2) + return -EINVAL; + /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't @@ -220,16 +242,15 @@ int sanity_check_segment_list(struct kimage *image) */ if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; + if ((mstart < phys_to_boot_phys(crashk_res.start)) || + (mend > phys_to_boot_phys(crashk_res.end))) + return -EADDRNOTAVAIL; } } @@ -352,7 +373,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -478,7 +499,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -533,13 +554,13 @@ void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -633,7 +654,7 @@ static struct page *kimage_alloc_page(struct kimage *image, * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -648,12 +669,12 @@ static struct page *kimage_alloc_page(struct kimage *image, if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -676,7 +697,7 @@ static struct page *kimage_alloc_page(struct kimage *image, struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -732,7 +753,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -793,7 +814,7 @@ static int kimage_load_crash_segment(struct kimage *image, char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -921,7 +942,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -1374,7 +1395,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __weak paddr_vmcoreinfo_note(void) +phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 152da4a48867..ee1bc1bb8feb 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded); static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", !!kexec_crash_image); + return sprintf(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); @@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size); static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), + phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); + return sprintf(buf, "%pa %x\n", &vmcore_base, (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5c2bc1052691..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 37649e69056c..8a99abf58080 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_halted); + WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 22e025309845..b9d031516254 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, */ if ((counter == qstat_pv_latency_kick) || (counter == qstat_pv_latency_wake)) { - stat = 0; if (kicks) stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); } diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) -{ - return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); -} -EXPORT_SYMBOL(phys_to_pfn_t); - #ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); @@ -253,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -288,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -308,12 +304,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { - dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", - __func__); - return ERR_PTR(-ENXIO); - } - if (!ref) return ERR_PTR(-EINVAL); @@ -363,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -383,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); @@ -401,7 +398,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) { /* @@ -427,5 +423,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 5f71aa63ed2a..529efae9f481 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,6 +60,7 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> +#include <linux/dynamic_debug.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -264,7 +265,7 @@ static void module_assert_mutex_or_preempt(void) if (unlikely(!debug_locks)) return; - WARN_ON(!rcu_read_lock_sched_held() && + WARN_ON_ONCE(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } @@ -336,7 +337,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_exit(struct module *mod, long code) { module_put(mod); do_exit(code); @@ -1693,8 +1694,7 @@ static int module_add_modinfo_attrs(struct module *mod) temp_attr = mod->modinfo_attrs; for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { + if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, @@ -1858,10 +1858,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. * * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1884,14 +1885,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. */ @@ -1899,21 +1910,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1922,6 +1938,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1964,6 +1981,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -2306,6 +2325,7 @@ static void layout_sections(struct module *mod, struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2337,7 +2357,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2367,7 +2391,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -2687,13 +2718,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2712,7 +2748,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -2920,8 +2956,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); @@ -3090,6 +3130,8 @@ static int move_module(struct module *mod, struct load_info *info) static int check_module_license_and_versions(struct module *mod) { + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -3108,6 +3150,9 @@ static int check_module_license_and_versions(struct module *mod) add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -3155,16 +3200,41 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +/* module_blacklist is a comma-separated list of module names */ +static char *module_blacklist; +static bool blacklisted(char *module_name) +{ + const char *p; + size_t len; + + if (!module_blacklist) + return false; + + for (p = module_blacklist; *p; p += len) { + len = strcspn(p, ","); + if (strlen(module_name) == len && !memcmp(module_name, p, len)) + return true; + if (p[len] == ',') + len++; + } + return false; +} +core_param(module_blacklist, module_blacklist, charp, 0400); + static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; + if (blacklisted(mod->name)) + return ERR_PTR(-EPERM); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -3178,6 +3248,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3344,12 +3423,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3441,8 +3522,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, @@ -3498,7 +3578,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; diff --git a/kernel/panic.c b/kernel/panic.c index 8aa74497cc5a..ca8cea1ef673 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,6 +108,7 @@ void panic(const char *fmt, ...) long i, i_next = 0; int state = 0; int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; /* * Disable local interrupts. This will prevent panic_smp_self_stop @@ -160,7 +161,7 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) { + if (!_crash_kexec_post_notifiers) { printk_nmi_flush_on_panic(); __crash_kexec(NULL); } @@ -191,7 +192,7 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (crash_kexec_post_notifiers) + if (_crash_kexec_post_notifiers) __crash_kexec(NULL); bust_spinlocks(0); @@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); - -static int __init setup_crash_kexec_post_notifiers(char *s) -{ - crash_kexec_post_notifiers = true; - return 0; -} -early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); +core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); static int __init oops_setup(char *s) { diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a881c6a7ba74..33c79b6105c5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -300,12 +300,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d90df926b59f..b02228411d57 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; @@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable) unsigned long size; size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_FILE_MAPPED); return saveable <= size ? 0 : saveable - size; } diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -99,27 +99,33 @@ again: return add; } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +static void printk_nmi_flush_line(const char *text, int len) { - const char *buf = s->buffer + start; - /* * The buffers are flushed in NMI only on panic. The messages must * go only into the ring buffer at this stage. Consoles will get * explicitly called later when a crashdump is not generated. */ if (in_nmi()) - printk_deferred("%.*s", (end - start) + 1, buf); + printk_deferred("%.*s", len, text); else - printk("%.*s", (end - start) + 1, buf); + printk("%.*s", len, text); } /* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, + int start, int end) +{ + const char *buf = s->buffer + start; + + printk_nmi_flush_line(buf, (end - start) + 1); +} + +/* * Flush data from the associated per_CPU buffer. The function * can be called either via IRQ work or independently. */ @@ -150,9 +156,11 @@ more: * the buffer an unexpected way. If we printed something then * @len must only increase. */ - if (i && i >= len) - pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", - i, len); + if (i && i >= len) { + const char *msg = "printk_nmi_flush: internal error\n"; + + printk_nmi_flush_line(msg, strlen(msg)); + } if (!len) goto out; /* Someone else has already flushed the buffer. */ @@ -166,14 +174,14 @@ more: /* Print line by line. */ for (; i < size; i++) { if (s->buffer[i] == '\n') { - print_nmi_seq_line(s, last_i, i); + printk_nmi_flush_seq_line(s, last_i, i); last_i = i + 1; } } /* Check if there was a partial line. */ if (last_i < size) { - print_nmi_seq_line(s, last_i, size - 1); - pr_cont("\n"); + printk_nmi_flush_seq_line(s, last_i, size - 1); + printk_nmi_flush_line("\n", strlen("\n")); } /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..eea6dbc2d8cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -26,7 +26,6 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> @@ -48,7 +47,7 @@ #include <linux/uio.h> #include <asm/uaccess.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -86,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +enum devkmsg_log_bits { + __DEVKMSG_LOG_BIT_ON = 0, + __DEVKMSG_LOG_BIT_OFF, + __DEVKMSG_LOG_BIT_LOCK, +}; + +enum devkmsg_log_masks { + DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), + DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), + DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), +}; + +/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ +#define DEVKMSG_LOG_MASK_DEFAULT 0 + +static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + +static int __control_devkmsg(char *str) +{ + if (!str) + return -EINVAL; + + if (!strncmp(str, "on", 2)) { + devkmsg_log = DEVKMSG_LOG_MASK_ON; + return 2; + } else if (!strncmp(str, "off", 3)) { + devkmsg_log = DEVKMSG_LOG_MASK_OFF; + return 3; + } else if (!strncmp(str, "ratelimit", 9)) { + devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + return 9; + } + return -EINVAL; +} + +static int __init control_devkmsg(char *str) +{ + if (__control_devkmsg(str) < 0) + return 1; + + /* + * Set sysctl string accordingly: + */ + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "on", 2); + } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "off", 3); + } + /* else "ratelimit" which is set by default. */ + + /* + * Sysctl cannot change it anymore. The kernel command line setting of + * this parameter is to force the setting to be permanent throughout the + * runtime of the system. This is a precation measure against userspace + * trying to be a smarta** and attempting to change it up on us. + */ + devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; + + return 0; +} +__setup("printk.devkmsg=", control_devkmsg); + +char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; + +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char old_str[DEVKMSG_STR_MAX_SIZE]; + unsigned int old; + int err; + + if (write) { + if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) + return -EINVAL; + + old = devkmsg_log; + strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + } + + err = proc_dostring(table, write, buffer, lenp, ppos); + if (err) + return err; + + if (write) { + err = __control_devkmsg(devkmsg_log_str); + + /* + * Do not accept an unknown string OR a known string with + * trailing crap... + */ + if (err < 0 || (err + 1 != *lenp)) { + + /* ... and restore old setting. */ + devkmsg_log = old; + strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + + return -EINVAL; + } + } + + return 0; +} + /* * Number of registered extended console drivers. * @@ -614,6 +718,7 @@ struct devkmsg_user { u64 seq; u32 idx; enum log_flags prev; + struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; }; @@ -623,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ + struct file *file = iocb->ki_filp; + struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; - if (len > LOG_LINE_MAX) + if (!user || len > LOG_LINE_MAX) return -EINVAL; + + /* Ignore when user logging is disabled. */ + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return len; + + /* Ratelimit when not explicitly enabled. */ + if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { + if (!___ratelimit(&user->rs, current->comm)) + return ret; + } + buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -800,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file) struct devkmsg_user *user; int err; - /* write-only does not need any file context */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - return 0; + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return -EPERM; - err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - if (err) - return err; + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); + if (err) + return err; + } user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; + ratelimit_default_init(&user->rs); + ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); + mutex_init(&user->lock); raw_spin_lock_irq(&logbuf_lock); @@ -831,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file) if (!user) return 0; + ratelimit_state_exit(&user->rs); + mutex_destroy(&user->lock); kfree(user); return 0; @@ -986,6 +1111,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); +static bool suppress_message_printing(int level) +{ + return (level >= console_loglevel && !ignore_loglevel); +} + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -1015,7 +1145,7 @@ static void boot_delay_msec(int level) unsigned long timeout; if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) - || (level >= console_loglevel && !ignore_loglevel)) { + || suppress_message_printing(level)) { return; } @@ -1439,8 +1569,6 @@ static void call_console_drivers(int level, trace_console(text, len); - if (level >= console_loglevel && !ignore_loglevel) - return; if (!console_drivers) return; @@ -1888,6 +2016,7 @@ static void call_console_drivers(int level, static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ DEFINE_PER_CPU(printk_func_t, printk_func); @@ -2167,6 +2296,13 @@ static void console_cont_flush(char *text, size_t size) if (!cont.len) goto out; + if (suppress_message_printing(cont.level)) { + cont.cons = cont.len; + if (cont.flushed) + cont.len = 0; + goto out; + } + /* * We still queue earlier records, likely because the console was * busy. The earlier ones need to be printed before this one, we @@ -2270,10 +2406,13 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + level = msg->level; + if ((msg->flags & LOG_NOCONS) || + suppress_message_printing(level)) { /* * Skip record we have buffered and already printed - * directly to the console when we received it. + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; @@ -2287,7 +2426,6 @@ skip: goto skip; } - level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { @@ -3177,9 +3315,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -328,68 +328,57 @@ out: put_cpu(); } -static int profile_cpu_callback(struct notifier_block *info, - unsigned long action, void *__cpu) +static int profile_dead_cpu(unsigned int cpu) { - int node, cpu = (unsigned long)__cpu; struct page *page; + int i; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - node = cpu_to_mem(cpu); - per_cpu(cpu_profile_flip, cpu) = 0; - if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - return notifier_from_errno(-ENOMEM); - per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); - } - if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - goto out_free; - per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); - } - break; -out_free: - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - return notifier_from_errno(-ENOMEM); - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_set_cpu(cpu, prof_cpu_mask); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_clear_cpu(cpu, prof_cpu_mask); - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); + per_cpu(cpu_profile_hits, cpu)[i] = NULL; __free_page(page); } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); + } + return 0; +} + +static int profile_prepare_cpu(unsigned int cpu) +{ + int i, node = cpu_to_mem(cpu); + struct page *page; + + per_cpu(cpu_profile_flip, cpu) = 0; + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) + continue; + + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) { + profile_dead_cpu(cpu); + return -ENOMEM; } - break; + per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); + } - return NOTIFY_OK; + return 0; +} + +static int profile_online_cpu(unsigned int cpu) +{ + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); + + return 0; } + #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) -#define profile_cpu_callback NULL static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_SMP -static void profile_nop(void *unused) -{ -} - -static int create_hash_tables(void) +int __ref create_proc_profile(void) { - int cpu; - - for_each_online_cpu(cpu) { - int node = cpu_to_mem(cpu); - struct page *page; - - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[1] - = (struct profile_hit *)page_address(page); - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[0] - = (struct profile_hit *)page_address(page); - } - return 0; -out_cleanup: - prof_on = 0; - smp_mb(); - on_each_cpu(profile_nop, NULL, 1); - for_each_online_cpu(cpu) { - struct page *page; - - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - } - return -1; -} -#else -#define create_hash_tables() ({ 0; }) + struct proc_dir_entry *entry; +#ifdef CONFIG_SMP + enum cpuhp_state online_state; #endif -int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ -{ - struct proc_dir_entry *entry; int err = 0; if (!prof_on) return 0; - - cpu_notifier_register_begin(); - - if (create_hash_tables()) { - err = -ENOMEM; - goto out; - } - +#ifdef CONFIG_SMP + err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", + profile_prepare_cpu, profile_dead_cpu); + if (err) + return err; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", + profile_online_cpu, NULL); + if (err < 0) + goto err_state_prep; + online_state = err; + err = 0; +#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - goto out; + goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - __hotcpu_notifier(profile_cpu_callback, 0); -out: - cpu_notifier_register_done(); + return err; +err_state_onl: +#ifdef CONFIG_SMP + cpuhp_remove_state(online_state); +err_state_prep: + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); +#endif return err; } subsys_initcall(create_proc_profile); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d49bfa1e53e6..1d3b7665d0be 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) return -EINVAL; if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || - !config_enabled(CONFIG_SECCOMP)) + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f433959e9322..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching); * offline to continue to use RCU for one jiffy after marking itself * offline in the cpu_online_mask. This leniency is necessary given the * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the CPU_DYING - * notifiers. + * the fact that a CPU enters the scheduler after completing the teardown + * of the CPU. * - * This is also why RCU internally marks CPUs online during the - * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * This is also why RCU internally marks CPUs online during in the + * preparation phase and offline after the CPU has been taken down. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. @@ -3806,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void rcu_prepare_cpu(int cpu) +int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) rcu_init_percpu_data(cpu, rsp); + + rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); + + return 0; +} + +static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + + rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); +} + +int rcutree_online_cpu(unsigned int cpu) +{ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); + return 0; +} + +int rcutree_offline_cpu(unsigned int cpu) +{ + rcutree_affinity_setting(cpu, cpu); + return 0; +} + + +int rcutree_dying_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); + return 0; +} + +int rcutree_dead_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -3851,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu) } #endif -/* - * Handle CPU online/offline notification events. - */ -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct rcu_state *rsp; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_prepare_cpu(cpu); - rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - sync_sched_exp_online_cleanup(cpu); - rcu_boost_kthread_setaffinity(rnp, -1); - break; - case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) { - rcu_cleanup_dead_cpu(cpu, rsp); - do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); - } - break; - default: - break; - } - return NOTIFY_OK; -} - static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4208,10 +4208,9 @@ void __init rcu_init(void) * this is called early in boot, before either interrupts * or the scheduler are operational. */ - cpu_notifier(rcu_cpu_notify, 0); pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + rcutree_prepare_cpu(cpu); } #include "tree_exp.h" diff --git a/kernel/relay.c b/kernel/relay.c index 04d7cf3ef8cf..d797502140b9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) if (!dentry) goto free_buf; relay_set_buf_dentry(buf, dentry); + } else { + /* Only retrieve global info, nothing more, nothing less */ + dentry = chan->cb->create_buf_file(NULL, NULL, + S_IRUSR, buf, + &chan->is_global); + if (WARN_ON(dentry)) + goto free_buf; } buf->cpu = cpu; @@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb, * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. + * + * If opening a buffer (@parent = NULL) that you later wish to register + * in a filesystem, call relay_late_setup_files() once the @parent dentry + * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info) * * Returns 0 if successful, non-zero otherwise. * - * Use to setup files for a previously buffer-only channel. - * Useful to do early tracing in kernel, before VFS is up, for example. + * Use to setup files for a previously buffer-only channel created + * by relay_open() with a NULL parent dentry. + * + * For example, this is useful for perfomring early tracing in kernel, + * before VFS is up and then exposing the early results once the dentry + * is available. */ int relay_late_setup_files(struct rchan *chan, const char *base_filename, @@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan, } chan->has_base_filename = 1; chan->parent = parent; + + if (chan->is_global) { + err = -EINVAL; + if (!WARN_ON_ONCE(!chan->buf[0])) { + dentry = relay_create_buf_file(chan, chan->buf[0], 0); + if (dentry && !WARN_ON_ONCE(!chan->is_global)) { + relay_set_buf_dentry(chan->buf[0], dentry); + err = 0; + } + } + mutex_unlock(&relay_channels_mutex); + return err; + } + curr_cpu = get_cpu(); /* * The CPU hotplug notifier ran before us and created buffers with @@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan, return err; } +EXPORT_SYMBOL_GPL(relay_late_setup_files); /** * relay_switch_subbuf - switch to a new sub-buffer diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c883fe8e440..44817c640e99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <linux/context_tracking.h> #include <linux/compiler.h> #include <linux/frame.h> +#include <linux/prefetch.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -2015,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -2972,6 +2995,23 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + +/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -3005,6 +3045,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5be58820465c..d4184498c9f5 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) if (old_idx == IDX_INVALID) { cp->size++; - cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].dl = dl; cp->elements[cp->size - 1].cpu = cpu; cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1934f658c036..a846cf89eb96 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } +/* + * When a guest is interrupted for a longer amount of time, missed clock + * ticks are not redelivered later. Due to that, this function may on + * occasion account more time than the calling functions think elapsed. + */ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT @@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * idle, or potentially user or system time. Due to rounding, * other time can exceed ticks occasionally. */ - other = account_other_time(cputime); + other = account_other_time(ULONG_MAX); if (other >= cputime) return; cputime -= other; @@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick) } cputime = cputime_one_jiffy; - steal = steal_account_process_time(cputime); + steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) return; @@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick) */ void account_idle_ticks(unsigned long ticks) { + cputime_t cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - account_idle_time(jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + steal = steal_account_process_time(ULONG_MAX); + + if (steal >= cputime) + return; + + cputime -= steal; + account_idle_time(cputime); } /* @@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. @@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: @@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) unsigned long now = READ_ONCE(jiffies); cputime_t delta, other; + /* + * Unlike tick based timing, vtime based timing never has lost + * ticks, and no need for steal time accounting to make up for + * lost ticks. Vtime accounts a rounded version of actual + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ delta = jiffies_to_cputime(now - tsk->vtime_snap); other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fcb7f0217ff4..1ce8867283dc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * * XXX figure out if select_task_rq_dl() deals with offline cpus. */ - if (unlikely(!rq->online)) + if (unlikely(!rq->online)) { + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); + rf.cookie = lockdep_pin_lock(&rq->lock); + } /* * Queueing this task back might have overloaded rq, check if we need diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4088eedea763..039de34f1521 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7002796f14a4..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; @@ -554,20 +554,10 @@ void secure_computing_strict(int this_syscall) BUG(); } #else -int __secure_computing(void) -{ - u32 phase1_result = seccomp_phase1(NULL); - - if (likely(phase1_result == SECCOMP_PHASE1_OK)) - return 0; - else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) - return -1; - else - return seccomp_phase2(phase1_result); -} #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) { u32 filter_ret, action; int data; @@ -599,10 +589,50 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) goto skip; case SECCOMP_RET_TRACE: - return filter_ret; /* Save the rest for phase 2. */ + /* We've been put in this state by the ptracer already. */ + if (recheck_after_trace) + return 0; + + /* ENOSYS these calls if there is no tracer attached. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, + task_pt_regs(current), + -ENOSYS, 0); + goto skip; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. + */ + if (fatal_signal_pending(current)) + goto skip; + /* Check if the tracer forced the syscall to be skipped. */ + this_syscall = syscall_get_nr(current, task_pt_regs(current)); + if (this_syscall < 0) + goto skip; + + /* + * Recheck the syscall, since it may have changed. This + * intentionally uses a NULL struct seccomp_data to force + * a reload of all registers. This does not goto skip since + * a skip would have already been reported. + */ + if (__seccomp_filter(this_syscall, NULL, true)) + return -1; + + return 0; case SECCOMP_RET_ALLOW: - return SECCOMP_PHASE1_OK; + return 0; case SECCOMP_RET_KILL: default: @@ -614,96 +644,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) skip: audit_seccomp(this_syscall, 0, action); - return SECCOMP_PHASE1_SKIP; + return -1; +} +#else +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) +{ + BUG(); } #endif -/** - * seccomp_phase1() - run fast path seccomp checks on the current syscall - * @arg sd: The seccomp_data or NULL - * - * This only reads pt_regs via the syscall_xyz helpers. The only change - * it will make to pt_regs is via syscall_set_return_value, and it will - * only do that if it returns SECCOMP_PHASE1_SKIP. - * - * If sd is provided, it will not read pt_regs at all. - * - * It may also call do_exit or force a signal; these actions must be - * safe. - * - * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should - * be processed normally. - * - * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be - * invoked. In this case, seccomp_phase1 will have set the return value - * using syscall_set_return_value. - * - * If it returns anything else, then the return value should be passed - * to seccomp_phase2 from a context in which ptrace hooks are safe. - */ -u32 seccomp_phase1(struct seccomp_data *sd) +int __secure_computing(const struct seccomp_data *sd) { int mode = current->seccomp.mode; - int this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + int this_syscall; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) - return SECCOMP_PHASE1_OK; + return 0; + + this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ - return SECCOMP_PHASE1_OK; -#ifdef CONFIG_SECCOMP_FILTER + return 0; case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, sd); -#endif + return __seccomp_filter(this_syscall, sd, false); default: BUG(); } } - -/** - * seccomp_phase2() - finish slow path seccomp work for the current syscall - * @phase1_result: The return value from seccomp_phase1() - * - * This must be called from a context in which ptrace hooks can be used. - * - * Returns 0 if the syscall should be processed or -1 to skip the syscall. - */ -int seccomp_phase2(u32 phase1_result) -{ - struct pt_regs *regs = task_pt_regs(current); - u32 action = phase1_result & SECCOMP_RET_ACTION; - int data = phase1_result & SECCOMP_RET_DATA; - - BUG_ON(action != SECCOMP_RET_TRACE); - - audit_seccomp(syscall_get_nr(current, regs), 0, action); - - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - return -1; - } - - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - do_exit(SIGSYS); - if (syscall_get_nr(current, regs) < 0) - return -1; /* Explicit request to skip. */ - - return 0; -} #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) diff --git a/kernel/smp.c b/kernel/smp.c index 36552beed397..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); -static int -hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +int smpcfd_prepare_cpu(unsigned int cpu) { - long cpu = (long)hcpu; struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, - cpu_to_node(cpu))) - return notifier_from_errno(-ENOMEM); - cfd->csd = alloc_percpu(struct call_single_data); - if (!cfd->csd) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - /* Fall-through to the CPU_DEAD[_FROZEN] case. */ - - case CPU_DEAD: - case CPU_DEAD_FROZEN: + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return -ENOMEM; + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { free_cpumask_var(cfd->cpumask); - free_percpu(cfd->csd); - break; + return -ENOMEM; + } - case CPU_DYING: - case CPU_DYING_FROZEN: - /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. - */ - flush_smp_call_function_queue(false); - break; -#endif - }; + return 0; +} + +int smpcfd_dead_cpu(unsigned int cpu) +{ + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - return NOTIFY_OK; + free_cpumask_var(cfd->cpumask); + free_percpu(cfd->csd); + return 0; } -static struct notifier_block hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, -}; +int smpcfd_dying_cpu(unsigned int cpu) +{ + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + return 0; +} void __init call_function_init(void) { - void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); - hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); - register_cpu_notifier(&hotplug_cfd_notifier); + smpcfd_prepare_cpu(smp_processor_id()); } /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a467e6c28a3b..4a1ca5f6da7e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/lglock.h> +#include <linux/nmi.h> /* * Structure to determine completion condition and record errors. May @@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data) break; } ack_state(msdata); + } else if (curstate > MULTI_STOP_PREPARE) { + /* + * At this stage all other CPUs we depend on must spin + * in the same loop. Any reason for hard-lockup should + * be detected and reported on their side. + */ + touch_nmi_watchdog(); } } while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..a13bbdaab47d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -814,6 +814,13 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { .procname = "dmesg_restrict", .data = &dmesg_restrict, .maxlen = sizeof(int), @@ -1508,8 +1515,8 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, @@ -2133,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2252,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2851,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2896,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ab4842b00e8..d513051fcca2 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) struct callback_head *head; do { - head = ACCESS_ONCE(task->task_works); + head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; @@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; + + if (likely(!task->task_works)) + return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the @@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = ACCESS_ONCE(*pprev))) { - smp_read_barrier_depends(); + while ((work = lockless_dereference(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -95,7 +97,7 @@ void task_work_run(void) * work_exited unless the list is empty. */ do { - work = ACCESS_ONCE(task->task_works); + work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d13c9aebf7a3..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void init_hrtimers_cpu(int cpu) +int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu) cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -static void migrate_hrtimers(int scpu) +int hrtimers_dead_cpu(unsigned int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int i; @@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + return 0; } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int scpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(scpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_hrtimers(scpu); - break; -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - void __init hrtimers_init(void) { - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); + hrtimers_prepare_cpu(smp_processor_id()); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); + now = tick_nohz_start_idle(ts); + if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; - now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3b65746c7f15..e07fb093f819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index cb9ab401e2d9..32bf6f75a8fe 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool is_max_delta; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* * We have a fresh next event. Check whether we can forward the base: @@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) expires = basem; base->is_idle = false; } else { - expires = basem + (nextevt - basej) * TICK_NSEC; + if (!is_max_delta) + expires = basem + (nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ @@ -1804,7 +1807,7 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } -static void migrate_timers(int cpu) +int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; @@ -1831,29 +1834,9 @@ static void migrate_timers(int cpu) spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } + return 0; } -static int timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers((long)hcpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static inline void timer_register_cpu_notifier(void) -{ - cpu_notifier(timer_cpu_notify, 0); -} -#else -static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) @@ -1881,7 +1864,6 @@ void __init init_timers(void) { init_timer_cpus(); init_timer_stats(); - timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -542,6 +542,7 @@ config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP + select TRACING default n help Hist triggers allow one or more arbitrary trace event fields diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb345cd11883..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(op_flags, META); what |= MASK_TC_BIT(op_flags, PREFLUSH); what |= MASK_TC_BIT(op_flags, FUA); - if (op == REQ_OP_DISCARD) + if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); @@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), &rpdu); } @@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops) { - return !list_empty(&ftrace_pids); + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (!test_tsk_trace_trace(current)) + struct trace_array *tr = op->private; + + if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; - if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + if (ftrace_pids_enabled(ops)) ops->func = ftrace_pid_func; ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) static void ftrace_update_pid_func(void) { - bool enabled = ftrace_pids_enabled(); struct ftrace_ops *op; /* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) do_for_each_ftrace_op(op, ftrace_ops_list) { if (op->flags & FTRACE_OPS_FL_PID) { - op->func = enabled ? ftrace_pid_func : - op->saved_func; + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; ftrace_update_trampoline(op); } } while_for_each_ftrace_op(op); @@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) return ops->func; } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) { - struct task_struct *p; - int cpu; + struct trace_array *tr = data; + struct trace_pid_list *pid_list; - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void set_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; + pid_list = rcu_dereference_sched(tr->function_pids); - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, next)); } -static void clear_ftrace_pid(struct pid *pid) +static void clear_ftrace_pids(struct trace_array *tr) { - struct task_struct *p; + struct trace_pid_list *pid_list; + int cpu; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + if (!pid_list) + return; - put_pid(pid); -} + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); -static void set_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} + rcu_assign_pointer(tr->function_pids, NULL); -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); + trace_free_pid_list(pid_list); } -static int ftrace_pid_add(int p) +static void ftrace_pid_reset(struct trace_array *tr) { - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); + clear_ftrace_pids(tr); ftrace_update_pid_func(); - ftrace_startup_all(0); mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; } -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } - - ftrace_update_pid_func(); - ftrace_startup_all(0); - - mutex_unlock(&ftrace_lock); -} +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); - if (!ftrace_pids_enabled() && (!*pos)) - return (void *) 1; + pid_list = rcu_dereference_sched(tr->function_pids); - return seq_list_start(&ftrace_pids, *pos); + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); } static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) { - if (v == (void *)1) + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) return NULL; - return seq_list_next(v, &ftrace_pids, pos); + return trace_pid_next(pid_list, v, pos); } static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) { + rcu_read_unlock_sched(); mutex_unlock(&ftrace_lock); } static int fpid_show(struct seq_file *m, void *v) { - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { + if (v == FTRACE_NO_PIDS) { seq_puts(m, "no pid\n"); return 0; } - if (fpid->pid == ftrace_swapper_pid) - seq_puts(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; + return trace_pid_show(m, v); } static const struct seq_operations ftrace_pid_sops = { @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { static int ftrace_pid_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); + ftrace_pid_reset(tr); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, &ftrace_pid_sops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } return ret; } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, current)); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64], *tmp; - long val; - int ret; + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list; + ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; + if (!cnt) + return 0; + + mutex_lock(&ftrace_lock); + + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + rcu_assign_pointer(tr->function_pids, pid_list); - buf[cnt] = 0; + if (filtered_pids) { + synchronize_sched(); + trace_free_pid_list(filtered_pids); + } else if (pid_list) { + /* Register a probe to set whether to ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. */ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; + on_each_cpu(ignore_task_cpu, tr, 1); - ret = kstrtol(tmp, 10, &val); - if (ret < 0) - return ret; + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); - ret = ftrace_pid_add(val); + if (ret > 0) + *ppos += ret; - return ret ? ret : cnt; + return ret; } static int ftrace_pid_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) - seq_release(inode, file); + struct trace_array *tr = inode->i_private; - return 0; + trace_array_put(tr); + + return seq_release(inode, file); } static const struct file_operations ftrace_pid_fops = { @@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct dentry *d_tracer; + trace_create_file("set_ftrace_pid", 0644, d_tracer, + tr, &ftrace_pid_fops); +} - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); ftrace_init_dyn_tracefs(d_tracer); - - trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - ftrace_profile_tracefs(d_tracer); - - return 0; } -fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 77eeab2776ef..7bc56762ca35 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@ #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> +#include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> @@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } +void trace_free_pid_list(struct trace_pid_list *pid_list) +{ + vfree(pid_list->pids); + kfree(pid_list); +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + loff_t l = 0; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) + return NULL; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret = 0; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return -ENOMEM; + + pid_list->pid_max = READ_ONCE(pid_max); + + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + for_each_set_bit(pid, filtered_pids->pids, + filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + nr_pids++; + } + } + + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val >= pid_list->pid_max) + break; + + pid = (pid_t)val; + + set_bit(pid, pid_list->pids); + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_free_pid_list(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_free_pid_list(pid_list); + read = ret; + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); + /* + * If regs is not set, then skip the following callers: + * trace_buffer_unlock_commit_regs + * event_trigger_unlock_commit + * trace_event_buffer_commit + * trace_event_raw_event_sched_switch + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* + * Add two, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ + if (!regs) + trace.skip += 2; + + /* * Since events can happen in NMIs there's no safe way to * use the per cpu ftrace_stacks. We reserve it and if an interrupt * or NMI comes in, it will just have to use the default @@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; - - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. - */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -6980,6 +7214,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(void *ingore) @@ -7133,6 +7368,7 @@ static __init int tracer_init_tracefs(void) return 0; init_tracer_tracefs(&global_trace, d_tracer); + ftrace_init_tracefs_toplevel(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum trace_type { FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ + filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) __packed + #include "trace_entries.h" /* @@ -156,6 +162,9 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; bool ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER + bool ftrace_ignore_pid; +#endif }; struct tracer; @@ -247,6 +256,7 @@ struct trace_array { int ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; /* function tracing enabled */ int function_enabled; #endif @@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +/* PID filtering */ + +extern int pid_max; + +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); + #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -821,12 +850,9 @@ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { - if (list_empty(&ftrace_pids)) - return 1; - - return test_tsk_trace_trace(task); + return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, @@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } @@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ @@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> @@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = @@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -/* Shouldn't this be in a header? */ -extern int pid_max; - -/* Returns true if found in filter */ -static bool -find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); -} - -static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) -{ - /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. - */ - if (!filtered_pids) - return false; - - return !find_filtered_pid(filtered_pids, task->pid); -} - -static void filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!find_filtered_pid(pid_list, self->pid)) - return; - } - - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - - /* "self" is set for forks, and NULL for exits */ - if (self) - set_bit(task->pid, pid_list->pids); - else - clear_bit(task->pid, pid_list->pids); -} - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { @@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, NULL, task); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, self, task); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, prev) && - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, prev) && + trace_ignore_this_task(pid_list, next)); } static void @@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, next)); } static void @@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, task)); + trace_ignore_this_task(pid_list, task)); } static void @@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - vfree(pid_list->pids); - kfree(pid_list); + trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) @@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - unsigned long pid = (unsigned long)v; - - (*pos)++; - - /* pid already is +1 of the actual prevous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); - - return NULL; + return trace_pid_next(pid_list, v, pos); } static void *p_start(struct seq_file *m, loff_t *pos) @@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; - unsigned long pid; - loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) if (!pid_list) return NULL; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) - return NULL; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)p_next(m, (void *)pid, &l)) - ; - return (void *)pid; + return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) @@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int p_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static ssize_t @@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; - struct trace_parser parser; - unsigned long val; - loff_t this_pos; - ssize_t read = 0; - ssize_t ret = 0; - pid_t pid; - int nr_pids = 0; + ssize_t ret; if (!cnt) return 0; @@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. - */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) { - read = -ENOMEM; - goto out; - } - pid_list->pid_max = READ_ONCE(pid_max); - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - read = -ENOMEM; - goto out; - } - if (filtered_pids) { - /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); - nr_pids++; - } - } - - while (cnt > 0) { - - this_pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &this_pos); - if (ret < 0 || !trace_parser_loaded(&parser)) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - parser.buffer[parser.idx] = 0; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - if (val >= pid_list->pid_max) - break; - - pid = (pid_t)val; - - set_bit(pid, pid_list->pids); - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - vfree(pid_list->pids); - kfree(pid_list); - read = ret; + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) goto out; - } - if (!nr_pids) { - /* Cleared the list of pids */ - vfree(pid_list->pids); - kfree(pid_list); - read = ret; - if (!filtered_pids) - goto out; - pid_list = NULL; - } rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - - vfree(filtered_pids->pids); - kfree(filtered_pids); - } else { + trace_free_pid_list(filtered_pids); + } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. @@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&event_mutex); - ret = read; - if (read > 0) - *ppos += read; + if (ret > 0) + *ppos += ret; return ret; } @@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = { static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, - .show = p_show, + .show = trace_pid_show, .stop = p_stop, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack verision is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (!ftrace_trace_task(current)) + if (!ftrace_trace_task(tr)) return 0; /* trace it when it is-nested-in or is a function enabled. */ @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) set_graph_array(tr); if (tracing_thresh) ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); + &trace_graph_entry); else ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", dev->bus->number, dev->devfn, dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; + end = dev->resource[i].end; trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, + void *data, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + ret = strlcpy(dst, current->comm, maxlen); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + *(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); + static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else ret = -EINVAL; + } else if (strcmp(arg, "comm") == 0) { + if (strcmp(t->name, "string") != 0 && + strcmp(t->name, "string_size") != 0) + return -EINVAL; + f->fn = t->fetch[FETCH_MTD_comm]; } else ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } + /* + * The default type of $comm should be "string", and it can't be + * dereferenced. + */ + if (!t && strcmp(arg, "$comm") == 0) + t = "string"; parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum { FETCH_MTD_reg = 0, FETCH_MTD_stack, FETCH_MTD_retval, + FETCH_MTD_comm, FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8 NULL +#define fetch_comm_u16 NULL +#define fetch_comm_u32 NULL +#define fetch_comm_u64 NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); + /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) ASSIGN_FETCH_FUNC(reg, ftype), \ ASSIGN_FETCH_FUNC(stack, ftype), \ ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(comm, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) return allowed; } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. + */ +bool current_in_userns(const struct user_namespace *target_ns) +{ + struct user_namespace *ns; + for (ns = current_user_ns(); ns; ns = ns->parent) { + if (ns == target_ns) + return true; + } + return false; +} + static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d12bd958077e..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4607,84 +4607,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. - */ -static int workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_prepare_cpu(unsigned int cpu) +{ + struct worker_pool *pool; + + for_each_cpu_worker_pool(pool, cpu) { + if (pool->nr_workers) + continue; + if (!create_worker(pool)) + return -ENOMEM; + } + return 0; +} + +int workqueue_online_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct worker_pool *pool; struct workqueue_struct *wq; int pi; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for_each_cpu_worker_pool(pool, cpu) { - if (pool->nr_workers) - continue; - if (!create_worker(pool)) - return NOTIFY_BAD; - } - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq_pool_mutex); - for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + for_each_pool(pool, pi) { + mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) - rebind_workers(pool); - else if (pool->cpu < 0) - restore_unbound_workers_cpumask(pool, cpu); + if (pool->cpu == cpu) + rebind_workers(pool); + else if (pool->cpu < 0) + restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + mutex_unlock(&wq_pool_mutex); + return 0; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_offline_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct work_struct unbind_work; struct workqueue_struct *wq; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); - - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); - - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; - } - return NOTIFY_OK; + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + return 0; } #ifdef CONFIG_SMP @@ -5486,9 +5467,6 @@ static int __init init_workqueues(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); - wq_numa_init(); /* initialize CPU pools */ |