diff options
Diffstat (limited to 'kernel')
95 files changed, 5640 insertions, 2745 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f70396e5a24b..2d9de86b7e76 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o @@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o -obj-$(CONFIG_PM) += power/ -obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o diff --git a/kernel/async.c b/kernel/async.c index 80b74b88fefe..bd0c168a3bbe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; -extern int initcall_debug; - /* * MUST be called with the lock held! diff --git a/kernel/audit.c b/kernel/audit.c index 2c1d6ab7106e..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: - if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... 
AUDIT_LAST_USER_MSG2: - if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) + if (!capable(CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); @@ -1418,12 +1418,12 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) + const struct path *path) { char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2c..816766803371 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. 
This makes a full * syscall record available if some * other part of the kernel decides it diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d678..a6c3f1abd206 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) + goto exit_free; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b3..af1de0f34eae 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,9 +70,15 @@ #include "audit.h" +/* flags stating the success for a 
syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +107,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +118,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +187,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. 
+ */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + struct audit_names *n; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) + return 1; + } + + return 0; } /* @@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + switch (f->val) { + /* 
process to file object comparisons */ + case AUDIT_COMPARE_UID_TO_OBJ_UID: + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return 
audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if 
(audit_comparator(n->ino, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { ++result; break; } @@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } @@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. 
+ * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
*/ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - - if (list_empty(list)) - continue; - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - 
for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; @@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, 
n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - 
audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). 
*/ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @success: success value of the syscall + * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit + * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). 
*/ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; - if (likely(!context)) + context = audit_get_context(tsk, success, return_code); + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) @@ -1821,6 +2008,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1831,9 +2042,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - - if (IS_ERR(name) || !name) - return; + struct audit_names *n; if (!context->in_syscall) { #if AUDIT_DEBUG == 2 @@ -1843,13 +2052,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) 
get_fs_pwd(current->fs, &context->pwd); } @@ -1871,12 +2082,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1897,39 +2109,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && 
context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? */ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = 
context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). 
*/ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { - unsigned int sessionid = atomic_inc_return(&session_id); + struct task_struct *task = current; struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; @@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) @@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); 
- audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/capability.c b/kernel/capability.c index b463871a4e69..3f1adb6c6470 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -287,74 +287,84 @@ error: } /** - * has_capability - Does a task have a capability in init_user_ns + * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. + * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability(struct task_struct *t, int cap) +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) { - int ret = security_real_capable(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), ns, cap); + rcu_read_unlock(); return (ret == 0); } /** - * has_capability - Does a task have a capability in a specific user ns + * has_capability - Does a task have a capability in init_user_ns * @t: The task in question - * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. + * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. 
*/ -bool has_ns_capability(struct task_struct *t, - struct user_namespace *ns, int cap) +bool has_capability(struct task_struct *t, int cap) { - int ret = security_real_capable(t, ns, cap); - - return (ret == 0); + return has_ns_capability(t, &init_user_ns, cap); } /** - * has_capability_noaudit - Does a task have a capability (unaudited) + * has_ns_capability_noaudit - Does a task have a capability (unaudited) + * in a specific user ns. * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to init_user_ns, false if not. Don't write an - * audit message for the check. + * currently in effect to the specified user namespace, false if not. + * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability_noaudit(struct task_struct *t, int cap) +bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) { - int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable_noaudit(__task_cred(t), ns, cap); + rcu_read_unlock(); return (ret == 0); } /** - * capable - Determine if the current task has a superior capability in effect + * has_capability_noaudit - Does a task have a capability (unaudited) in the + * initial user ns + * @t: The task in question * @cap: The capability to be tested for * - * Return true if the current task has the given superior capability currently - * available for use, false if not. + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. + * Note that this does not set PF_SUPERPRIV on the task. 
*/ -bool capable(int cap) +bool has_capability_noaudit(struct task_struct *t, int cap) { - return ns_capable(&init_user_ns, cap); + return has_ns_capability_noaudit(t, &init_user_ns, cap); } -EXPORT_SYMBOL(capable); /** * ns_capable - Determine if the current task has a superior capability in effect @@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(ns, current_cred(), cap) == 0) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } @@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap) EXPORT_SYMBOL(ns_capable); /** - * task_ns_capable - Determine whether current task has a superior - * capability targeted at a specific task's user namespace. - * @t: The task whose user namespace is targeted. - * @cap: The capability in question. + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. * - * Return true if it does, false otherwise. + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
*/ -bool task_ns_capable(struct task_struct *t, int cap) +bool capable(int cap) { - return ns_capable(task_cred_xxx(t, user)->user_ns, cap); + return ns_capable(&init_user_ns, cap); } -EXPORT_SYMBOL(task_ns_capable); +EXPORT_SYMBOL(capable); /** * nsown_capable - Check superior capability to one's own user_ns diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..f4ea4b6f3cf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); + ret = ss->pre_destroy(cgrp); if (ret) break; } @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; @@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; + struct css_set *cg; }; struct cgroup_taskset { @@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * will already exist. If not set, this function might sleep, and can fail with * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. 
*/ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1967,7 +1950,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. 
- * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. - */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset 
tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". - */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + rcu_read_lock(); do { struct task_and_cgroup ent; @@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. 
+ */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); + rcu_read_unlock(); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ retval = 0; @@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. 
*/ @@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. */ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); +out_put_css_set_refs: + if (retval) { + for (i = 0; i < group_size; i++) { + tc = flex_array_get(group, i); + if (!tc->cg) + break; + put_css_set(tc->cg); + } } out_cancel_attach: - /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. 
- */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. 
- */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** @@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. 
+ */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. 
*/ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. 
*/ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { @@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. 
allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). 
*/ - read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { @@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. 
*/ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. 
- * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } @@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. 
*/ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@ #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/cn_proc.h> #if 0 diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3cd..e2ae7349437f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); + kdb_printf("%4ld ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if 
(!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); + vaddr = kmap_atomic(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); + kunmap_atomic(vaddr); return 0; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 057e24b665cf..6581a040f399 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -115,8 +115,6 @@ int get_callchain_buffers(void) } err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a00..4b50357914fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -128,8 +135,9 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -815,7 +823,7 @@ static void update_event_times(struct perf_event *event) * here. 
*/ if (is_cgroup_event(event)) - run_end = perf_event_time(event); + run_end = perf_cgroup_event_time(event); else if (ctx->is_active) run_end = ctx->time; else @@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, } /* + * When sampling the branch stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branches from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling.
+ */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +/* * Called from scheduler to add the events of the current task * with interrupts disabled. * @@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2300,7 +2378,10 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static DEFINE_PER_CPU(int, perf_throttled_count); +static DEFINE_PER_CPU(u64, perf_throttled_seq); + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2319,22 +2400,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, 
PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. + */ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, + int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, now; + u64 now, period = TICK_NSEC; s64 delta; - if (!ctx->nr_freq) + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || needs_unthr)) return; + raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2344,13 +2443,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) hwc = &event->hw; - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { + if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } @@ -2358,14 +2452,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (!event->attr.freq || !event->attr.sample_freq) continue; - event->pmu->read(event); + /* + * stop the event and update event->count + */ + event->pmu->stop(event, PERF_EF_UPDATE); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; + /* + * restart the event + * reload only if value has changed 
+ * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. + */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); + + event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + + perf_pmu_enable(ctx->pmu); + raw_spin_unlock(&ctx->lock); } /* @@ -2388,16 +2498,13 @@ static void rotate_ctx(struct perf_event_context *ctx) */ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,37 +2512,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) + if (!rotate) goto done; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); - } + perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - done: if (remove) list_del_init(&cpuctx->rotation_list); @@ -2445,10 +2541,22 @@ void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); struct perf_cpu_context *cpuctx, *tmp; + struct 
perf_event_context *ctx; + int throttled; WARN_ON(!irqs_disabled()); + __this_cpu_inc(perf_throttled_seq); + throttled = __this_cpu_xchg(perf_throttled_count, 0); + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + ctx = &cpuctx->ctx; + perf_adjust_freq_unthr_context(ctx, throttled); + + ctx = cpuctx->task_ctx; + if (ctx) + perf_adjust_freq_unthr_context(ctx, throttled); + if (cpuctx->jiffies_interval == 1 || !(jiffies % cpuctx->jiffies_interval)) perf_rotate_context(cpuctx); @@ -2748,7 +2856,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2759,7 +2867,15 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); + } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); } } @@ -3208,10 +3324,6 @@ int perf_event_task_disable(void) return 0; } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) @@ -3220,21 +3332,26 @@ static int perf_event_index(struct perf_event *event) if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; + return event->pmu->event_idx(event); } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = 
event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3244,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3256,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3272,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event) barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) + if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + @@ -3281,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3538,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) event->mmap_user = get_current_user(); vma->vm_mm->pinned_vm += event->mmap_locked; + perf_event_update_userpage(event); + unlock: if (!ret) atomic_inc(&event->mmap_count); @@ -3769,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, 
running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3782,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); @@ -3872,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3914,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -4509,6 +4657,7 @@ static int __perf_event_overflow(struct perf_event *event, { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; + u64 seq; int ret = 0; /* @@ -4518,14 +4667,20 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(!is_sampling_event(event))) return 0; - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { + seq = __this_cpu_read(perf_throttled_seq); + if (seq != hwc->interrupts_seq) { + hwc->interrupts_seq = seq; + hwc->interrupts = 1; + } else { + hwc->interrupts++; + if (unlikely(throttle + && 
hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } - } else - hwc->interrupts++; + } if (event->attr.freq) { u64 now = perf_clock(); @@ -4534,7 +4689,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* @@ -4949,7 +5104,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4957,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -4968,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -4987,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } +static int perf_swevent_event_idx(struct perf_event *event) +{ + return 0; +} + static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5003,6 +5169,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -5071,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event 
*event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5089,6 +5263,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -5294,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5308,6 +5490,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; /* @@ -5366,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5380,6 +5570,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -5407,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +static int perf_event_idx_default(struct perf_event *event) +{ + return event->hw.idx + 1; +} + /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. 
@@ -5493,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) @@ -5596,6 +5794,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->event_idx) + pmu->event_idx = perf_event_idx_default; + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: @@ -5788,7 +5989,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5802,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; @@ -5871,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = 
mask; + } + } out: return ret; @@ -6026,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* @@ -6906,8 +7147,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -6924,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6941,8 +7180,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -6950,8 +7188,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. 
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; @@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ + return 0; +} + static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) @@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; for (i = 0; i < TYPE_MAX; i++) kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + if (err_cpu == cpu) + break; } return -ENOMEM; diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..16b07bfac224 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> +#include <linux/shm.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -424,7 +425,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. 
*/ current->flags |= (PF_NOFREEZE | PF_KTHREAD); @@ -818,25 +819,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) - tsk->exit_signal = SIGCHLD; - if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -887,7 +869,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -935,8 +917,6 @@ NORET_TYPE void do_exit(long code) schedule(); } - exit_irq_thread(); - exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against @@ -945,6 +925,8 @@ NORET_TYPE void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_irq_thread(); + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), @@ -953,7 +935,7 @@ NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -964,8 +946,7 @@ NORET_TYPE void 
do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); @@ -1039,6 +1020,22 @@ NORET_TYPE void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNNING. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ @@ -1051,7 +1048,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1067,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below).
*/ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/fork.c b/kernel/fork.c index 443f5125f11e..37674ec55cde 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include <linux/user-return-notifier.h> #include <linux/oom.h> #include <linux/khugepaged.h> +#include <linux/signalfd.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -354,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } @@ -510,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) return NULL; } +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif +} + /* * Allocate and initialize an mm_struct. 
*/ @@ -537,9 +556,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif + check_mm(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -647,6 +664,58 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + +static void complete_vfork_done(struct task_struct *tsk) +{ + struct completion *vfork; + + task_lock(tsk); + vfork = tsk->vfork_done; + if (likely(vfork)) { + tsk->vfork_done = NULL; + complete(vfork); + } + task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, + struct completion *vfork) +{ + int killed; + + freezer_do_not_count(); + killed = wait_for_completion_killable(vfork); + freezer_count(); + + if (killed) { + task_lock(child); + child->vfork_done = NULL; + task_unlock(child); + } + + put_task_struct(child); + return killed; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. 
@@ -662,8 +731,6 @@ EXPORT_SYMBOL_GPL(get_task_mm); */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - struct completion *vfork_done = tsk->vfork_done; - /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX if (unlikely(tsk->robust_list)) { @@ -683,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - /* notify parent sleeping on vfork() */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. + * trouble, say, a killed vfork parent shouldn't touch this mm. + * Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && @@ -873,6 +938,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; + struct io_context *new_ioc; if (!ioc) return 0; @@ -884,11 +950,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) return -ENOMEM; - tsk->io_context->ioprio = ioc->ioprio; + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); } #endif return 0; @@ -913,8 +980,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + 
signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } @@ -993,7 +1062,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; - new_flags |= PF_STARTING; p->flags = new_flags; } @@ -1170,6 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; @@ -1288,7 +1357,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->pdeath_signal = 0; p->exit_state = 0; @@ -1523,18 +1598,9 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + get_task_struct(p); } - audit_finish_fork(p); - - /* - * We set PF_STARTING at creation in case tracing wants to - * use this to distinguish a fully live task from one that - * hasn't finished SIGSTOP raising yet. Now we clear it - * and set the child going. 
- */ - p->flags &= ~PF_STARTING; - wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ @@ -1542,10 +1608,8 @@ long do_fork(unsigned long clone_flags, ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). 
* * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/futex.c b/kernel/futex.c index 1614be20173d..72efa1e4359a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2628,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) @@ -2641,49 +2641,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + if (!futex_cmpxchg_enabled) + return -ENOSYS; + } + + switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, flags, val, timeout, val3); - break; + return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, flags, val, val3); - break; + return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); - break; + return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); - break; + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); - break; + return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, val, timeout, 0); - break; + return futex_lock_pi(uaddr, flags, val, timeout, 0); case FUTEX_UNLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, flags); - break; + return futex_unlock_pi(uaddr, flags); case 
FUTEX_TRYLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); - break; + return futex_lock_pi(uaddr, flags, 0, timeout, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, - uaddr2); - break; + return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); - break; - default: - ret = -ENOSYS; + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); } - return ret; + return -ENOSYS; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2e48ec0c2e91..c21449f85a2a 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * For preemptible RCU it is sufficient to call rcu_read_unlock in order * to exit the grace period. For classic RCU, a reschedule is required. */ -static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) { + bool can_cont; + get_task_struct(g); get_task_struct(t); rcu_read_unlock(); cond_resched(); rcu_read_lock(); + can_cont = pid_alive(g) && pid_alive(t); put_task_struct(t); put_task_struct(g); + + return can_cont; } /* @@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; if (!--batch_count) { batch_count = HUNG_TASK_BATCHING; - rcu_lock_break(g, t); - /* Exit if t or g was unhashed during refresh. 
*/ - if (t->state == TASK_DEAD || g->state == TASK_DEAD) + if (!rcu_lock_break(g, t)) goto unlock; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc); + irq_startup(desc, false); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) + if (irq_startup(desc, false)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..6080f6bc8c33 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -16,6 +16,8 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <trace/events/irq.h> + #include "internals.h" /** @@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type != IRQ_TYPE_NONE) - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, irq, type); irq_put_desc_busunlock(desc, flags); return ret; } @@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } -int irq_startup(struct irq_desc *desc) +int irq_startup(struct irq_desc *desc, bool resend) { + int ret = 0; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { - int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); - return ret; + } else { + irq_enable(desc); } - - irq_enable(desc); - return 0; + if 
(resend) + check_irq_resend(desc, desc->irq_data.irq); + return ret; } void irq_shutdown(struct irq_desc *desc) @@ -330,6 +334,24 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ + /* + * We need to unmask in the following cases: + * - Standard level irq (IRQF_ONESHOT is not set) + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) + unmask_irq(desc); +} + /** * handle_level_irq - Level type irq handler * @irq: the interrupt number @@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) - unmask_irq(desc); + cond_unmask_irq(desc); + out_unlock: raw_spin_unlock(&desc->lock); } @@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); + if (desc->istate & IRQS_ONESHOT) + cond_unmask_irq(desc); + out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); out_unlock: @@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); - irq_startup(desc); + irq_startup(desc, true); } out: irq_put_desc_busunlock(desc, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bbe..6ff84e6a954c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) * device interrupt, so no irq storm is lurking. If the * RUNTHREAD bit is already set, nothing to do. 
*/ - if (test_bit(IRQTF_DIED, &action->thread_flags) || + if ((action->thread->flags & PF_EXITING) || test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; @@ -110,6 +110,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) * threads_oneshot untouched and runs the thread another time. */ desc->threads_oneshot |= action->thread_mask; + + /* + * We increment the threads_active counter in case we wake up + * the irq thread. The irq thread decrements the counter when + * it returns from the handler or in the exit path and wakes + * up waiters which are stuck in synchronize_irq() when the + * active count becomes zero. synchronize_irq() is serialized + * against this code (hard irq handler) via IRQS_INPROGRESS + * like the finalize_oneshot() code. See comment above. + */ + atomic_inc(&desc->threads_active); + wake_up_process(action->thread); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a73dd6c7372d..8e5c56b3b7d9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,19 +15,17 @@ #define istate core_internal_state__do_not_mess_with_it -extern int noirqdebug; +extern bool noirqdebug; /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run - * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, - IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, @@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern int irq_startup(struct irq_desc *desc); +extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct 
irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,189 +1,793 @@ +#include <linux/debugfs.h> +#include <linux/hardirq.h> +#include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/irqdesc.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/smp.h> +#include <linux/fs.h> + +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_domain; + /** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure + * irq_domain_alloc() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the interrupt controller + * @revmap_type: type of reverse mapping to use + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. + * Allocates and initializes an irq_domain structure. Caller is expected to + * register allocated irq_domain with irq_domain_add(). Returns pointer + * to IRQ domain, or NULL on failure.
*/ -void irq_domain_add(struct irq_domain *domain) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, + unsigned int revmap_type, + const struct irq_domain_ops *ops, + void *host_data) { - struct irq_data *d; - int hwirq, irq; + struct irq_domain *domain; - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (WARN_ON(!domain)) + return NULL; + + /* Fill structure */ + domain->revmap_type = revmap_type; + domain->ops = ops; + domain->host_data = host_data; + domain->of_node = of_node_get(of_node); + + return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + list_add(&domain->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + pr_debug("irq: Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; + int size = domain->revmap_data.legacy.size; + + if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) + return 0; + return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. 
+ * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + * be '0', but a positive integer can be used if the effective + * hwirqs numbering does not begin at zero. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). + */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int i; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + if (!domain) + return NULL; + + domain->revmap_data.legacy.first_irq = first_irq; + domain->revmap_data.legacy.first_hwirq = first_hwirq; + domain->revmap_data.legacy.size = size; + + mutex_lock(&irq_domain_mutex); + /* Verify that all the irqs are available */ + for (i = 0; i < size; i++) { + int irq = first_irq + i; + struct irq_data *irq_data = irq_get_irq_data(irq); + + if (WARN_ON(!irq_data || irq_data->domain)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(domain->of_node); + kfree(domain); + return NULL; } - d->domain = domain; - d->hwirq = hwirq; } - mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); + /* Claim all of the irqs before registering a legacy domain */ + for (i = 0; i < size; i++) { + struct irq_data *irq_data = irq_get_irq_data(first_irq + i); + irq_data->hwirq = first_hwirq + i; + irq_data->domain = domain; + } mutex_unlock(&irq_domain_mutex); + + for (i = 0; i < size; i++) { + int irq = first_irq + i; + int hwirq = first_hwirq + i; + + /* IRQ0 gets ignored */ + if (!irq) + continue; + + /* Legacy flags are left to default 
at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(domain, irq, hwirq); + + /* Clear norequest flags */ + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } + + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int *revmap; + + revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + if (WARN_ON(!revmap)) + return NULL; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); + if (!domain) { + kfree(revmap); + return NULL; + } + domain->revmap_data.linear.size = size; + domain->revmap_data.linear.revmap = revmap; + irq_domain_add(domain); + return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + if (domain) + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_tree() + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens).
+ */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_TREE, ops, host_data); + if (domain) { + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); + irq_domain_add(domain); + } + return domain; } /** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. + * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller */ -void irq_domain_del(struct irq_domain *domain) +struct irq_domain *irq_find_host(struct device_node *node) { - struct irq_data *d; - int hwirq, irq; + struct irq_domain *h, *found = NULL; + int rc; + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... + */ mutex_lock(&irq_domain_mutex); - list_del(&domain->list); + list_for_each_entry(h, &irq_domain_list, link) { + if (h->ops->match) + rc = h->ops->match(h, node); + else + rc = (h->of_node != NULL) && (h->of_node == node); + + if (rc) { + found = h; + break; + } + } mutex_unlock(&irq_domain_mutex); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @domain: default domain pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. 
+ */ +void irq_set_default_host(struct irq_domain *domain) +{ + pr_debug("irq: Default domain set to @0x%p\n", domain); + + irq_default_domain = domain; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map(domain, virq, hwirq)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_data->domain = NULL; + irq_data->hwirq = 0; + return -1; } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + + return 0; } -#if defined(CONFIG_OF_IRQ) /** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @domain: domain to allocate the irq for or NULL for default domain * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. 
+ */ +unsigned int irq_create_direct_mapping(struct irq_domain *domain) +{ + unsigned int virq; + + if (domain == NULL) + domain = irq_default_domain; + + BUG_ON(domain == NULL); + WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: create_direct virq allocation failed\n"); + return 0; + } + if (virq >= irq_virq_count) { + pr_err("ERROR: no free irqs available below %i maximum\n", + irq_virq_count); + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: create_direct obtained virq %d\n", virq); + + if (irq_setup_virq(domain, virq, virq)) { + irq_free_desc(virq); + return 0; + } + + return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. 
*/ +unsigned int irq_create_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + + /* Look for default domain if necessary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL domain, hwirq=%lx\n", hwirq); + WARN_ON(1); + return 0; + } + pr_debug("irq: -> using domain @%p\n", domain); + + /* Check if mapping already exists */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + return virq; + } + + /* Get a virtual interrupt number */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: -> virq allocation failed\n"); + return 0; + } + + if (irq_setup_virq(domain, virq, hwirq)) { + if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, domain->of_node ? 
domain->of_node->full_name : "null", virq); + + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + unsigned int virq; - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) - continue; - rc = domain->ops->dt_translate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; + domain = controller ? irq_find_host(controller) : irq_default_domain; + if (!domain) { +#ifdef CONFIG_MIPS + /* + * Workaround to avoid breaking interrupt controller drivers + * that don't yet register an irq_domain. This is temporary + * code. ~~~gcl, Feb 24, 2012 + * + * Scheduled for removal in Linux v3.6. That should be enough + * time. 
+ */ + if (intsize > 0) + return intspec[0]; +#endif + printk(KERN_WARNING "irq: no irq domain found for %s !\n", + controller->full_name); + return 0; } - mutex_unlock(&irq_domain_mutex); - if (rc != 0) - return 0; + /* If domain has no translation, then we assume interrupt line */ + if (domain->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (domain->ops->xlate(domain, controller, intspec, intsize, + &hwirq, &type)) + return 0; + } + + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; + /* Set type if specified and different than the current one */ + if (type != IRQ_TYPE_NONE && + type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + irq_set_irq_type(virq, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_domain *domain; + irq_hw_number_t hwirq; + + if (!virq || !irq_data) + return; + + domain = irq_data->domain; + if (WARN_ON(domain == NULL)) + return; + + /* Never unmap legacy interrupts */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_data->hwirq; + switch(domain->revmap_type) { + case 
IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } + + irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default domain if necessary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) + return 0; + + /* legacy -> bail early */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Slow path does a linear search of the map */ + if (hint == 0) + hint = 1; + i = hint; + do { + struct irq_data *data = irq_get_irq_data(i); + if (data && (data->domain == domain) && (data->hwirq == hwirq)) + return i; + i++; + if (i >= irq_virq_count) + i = 1; + } while(i != hint); + return 0; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). 
+ * This is a fast path, for use by irq controller code that uses radix tree + * revmaps */ -void irq_dispose_mapping(unsigned int irq) +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, + irq_hw_number_t hwirq) { + struct irq_data *irq_data; + + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(domain, hwirq); + + /* + * Freeing an irq can delete nodes along the path to + * do the lookup via call_rcu. + */ + rcu_read_lock(); + irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain + * If found in radix tree, then fine. + * Else fallback to linear lookup - this should not happen in practice + * as it means that we failed to insert the node in the radix tree. */ + return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @domain: domain owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that domain space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. 
+ */ +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) - return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; + struct irq_data *irq_data = irq_get_irq_data(virq); + + if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return; + + if (virq) { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(domain, hwirq); + + /* Check revmap bounds */ + if (unlikely(hwirq >= domain->revmap_data.linear.size)) + return irq_find_mapping(domain, hwirq); + + /* Check if revmap was allocated */ + revmap = domain->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(domain, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(!revmap[hwirq])) + revmap[hwirq] = irq_find_mapping(domain, hwirq); + + return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ + unsigned long flags; + struct irq_desc *desc; + const char *p; + static const char none[] = "none"; + void *data; + int i; + + seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", + "chip name", "chip data", "domain 
name"); + + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + if (desc->action && desc->action->handler) { + struct irq_chip *chip; + + seq_printf(m, "%5d ", i); + seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + + chip = irq_desc_get_chip(desc); + if (chip && chip->name) + p = chip->name; + else + p = none; + seq_printf(m, "%-15s ", p); + + data = irq_desc_get_chip_data(desc); + seq_printf(m, "0x%16p ", data); + + if (desc->irq_data.domain->of_node) + p = desc->irq_data.domain->of_node->full_name; + else + p = none; + seq_printf(m, "%s\n", p); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} +static int virq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { + .open = virq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ + if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + NULL, &virq_debug_fops) == NULL) + return -ENOMEM; + + return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) +{ + return 0; +} + +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. 
+ */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; return 0; } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); /** - * irq_domain_create_simple() - Set up a 'simple' translation range + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. */ -void irq_domain_add_simple(struct device_node *controller, int irq_base) +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) { - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } + if (WARN_ON(intsize < 2)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings. For the majority of controllers + * the _onecell() or _twocell() variants above should be used. 
+ */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, + struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + return 0; } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); +const struct irq_domain_ops irq_domain_simple_ops = { + .map = irq_domain_simple_map, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", + pr_debug("looking for phys_base=%llx, irq_start=%i\n", (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) - irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); + irq_domain_add_legacy(node, 32, irq_start, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..b0ccd1ac2d6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -759,6 +759,13 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, return ret; } +static void wake_threads_waitq(struct irq_desc *desc) +{ + if (atomic_dec_and_test(&desc->threads_active) && + waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); +} + /* * Interrupt handler thread */ @@ -771,57 +778,41 @@ static int irq_thread(void *data) struct irq_desc *desc = 
irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); - int wake; - if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, &param); - current->irqaction = action; + current->irq_thread = 1; while (!irq_wait_for_interrupt(action)) { + irqreturn_t action_ret; irq_thread_check_affinity(desc, action); - atomic_inc(&desc->threads_active); - - raw_spin_lock_irq(&desc->lock); - if (unlikely(irqd_irq_disabled(&desc->irq_data))) { - /* - * CHECKME: We might need a dedicated - * IRQ_THREAD_PENDING flag here, which - * retriggers the thread in check_irq_resend() - * but AFAICT IRQS_PENDING should be fine as it - * retriggers the interrupt itself --- tglx - */ - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - } else { - irqreturn_t action_ret; - - raw_spin_unlock_irq(&desc->lock); - action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); - } + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); - wake = atomic_dec_and_test(&desc->threads_active); - - if (wake && waitqueue_active(&desc->wait_for_threads)) - wake_up(&desc->wait_for_threads); + wake_threads_waitq(desc); } - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action, true); - /* - * Clear irqaction. Otherwise exit_irq_thread() would make + * This is the regular exit path. __free_irq() is stopping the + * thread via kthread_stop() after calling + * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the + * oneshot mask bit can be set. We cannot verify that as we + * cannot touch the oneshot mask at this point anymore as + * __setup_irq() might have given out current's thread_mask + * again. + * + * Clear irq_thread. 
Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. */ - current->irqaction = NULL; + current->irq_thread = 0; return 0; } @@ -832,27 +823,28 @@ void exit_irq_thread(void) { struct task_struct *tsk = current; struct irq_desc *desc; + struct irqaction *action; - if (!tsk->irqaction) + if (!tsk->irq_thread) return; + action = kthread_data(tsk); + printk(KERN_ERR "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - desc = irq_to_desc(tsk->irqaction->irq); + desc = irq_to_desc(action->irq); /* - * Prevent a stale desc->threads_oneshot. Must be called - * before setting the IRQTF_DIED flag. + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. */ - irq_finalize_oneshot(desc, tsk->irqaction, true); + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); - /* - * Set the THREAD DIED flag to prevent further wakeups of the - * soon to be gone threaded handler. - */ - set_bit(IRQTF_DIED, &tsk->irqaction->flags); + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); } static void irq_setup_forced_threading(struct irqaction *new) @@ -985,6 +977,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + /* + * Or all existing action->thread_mask bits, + * so we can find the next zero bit for this + * new action. + */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; @@ -993,14 +990,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* - * Setup the thread mask for this irqaction. Unlikely to have - * 32 resp 64 irqs sharing one line, but who knows. + * Setup the thread mask for this irqaction for ONESHOT. 
For + * !ONESHOT irqs the thread mask is 0 so we can avoid a + * conditional in irq_wake_thread(). */ - if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { - ret = -EBUSY; - goto out_mask; + if (new->flags & IRQF_ONESHOT) { + /* + * Unlikely to have 32 resp 64 irqs sharing one line, + * but who knows. + */ + if (thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + /* + * The thread_mask for the action is or'ed to + * desc->thread_active to indicate that the + * IRQF_ONESHOT thread handler has been woken, but not + * yet finished. The bit is cleared when a thread + * completes. When all threads of a shared interrupt + * line have completed desc->threads_active becomes + * zero and the interrupt line is unmasked. See + * handle.c:irq_wake_thread() for further information. + * + * If no thread is woken by primary (hard irq context) + * interrupt handlers, then desc->threads_active is + * also checked for zero to unmask the irq line in the + * affected hard irq flow handlers + * (handle_[fasteoi|level]_irq). + * + * The new action gets the first zero bit of + * thread_mask assigned. See the loop above which or's + * all existing action->thread_mask bits. 
+ */ + new->thread_mask = 1 << ffz(thread_mask); } - new->thread_mask = 1 << ffz(thread_mask); if (!shared) { init_waitqueue_head(&desc->wait_for_threads); @@ -1027,7 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->istate |= IRQS_ONESHOT; if (irq_settings_can_autoenable(desc)) - irq_startup(desc); + irq_startup(desc, true); else /* Undo nested disables: */ desc->depth = 1; @@ -1103,8 +1127,7 @@ out_thread: struct task_struct *t = new->thread; new->thread = NULL; - if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) - kthread_stop(t); + kthread_stop(t); put_task_struct(t); } out_mput: @@ -1214,8 +1237,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif if (action->thread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(action->thread); + kthread_stop(action->thread); put_task_struct(action->thread); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dc813a948be2..611cd6003c45 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->irqs_unhandled = 0; } -int noirqdebug __read_mostly; +bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70fc98a..43049192b5ec 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> -#include <linux/jump_label.h> +#include <linux/static_key.h> #ifdef HAVE_JUMP_LABEL @@ -29,11 +29,6 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) -{ - return !!atomic_read(&key->enabled); -} - static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct 
jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { + WARN(atomic_read(&key->enabled) < 0, + "jump label: negative count!\n"); return; + } if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + 
__static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -197,8 +215,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
+ */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES - struct module *mod = __module_address((jump_label_t)key); + struct module *mod = __module_address((unsigned long)key); __jump_label_mod_update(key, enable); diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d9604..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> -#include <linux/kmsg_dump.h> #include <linux/syscore_ops.h> #include 
<asm/page.h> @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); @@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; goto unlock; } @@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: @@ -1535,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. 
*/ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1568,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..c62b8546cc90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); @@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p) if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) - goto fail_with_jump_label; + jump_label_text_reserved(p->addr, p->addr)) { + ret = -EINVAL; + goto cannot_probe; + } /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p) * its code to prohibit unexpected unloading. 
*/ if (unlikely(!try_module_get(probed_mod))) - goto fail_with_jump_label; + goto cannot_probe; /* * If the module freed .init.text, we couldn't insert @@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - goto fail_with_jump_label; + goto cannot_probe; } /* ret will be updated by following code */ } @@ -1409,7 +1411,7 @@ out: return ret; -fail_with_jump_label: +cannot_probe: preempt_enable(); jump_label_unlock(); return ret; @@ -1673,8 +1675,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); @@ -2198,7 +2204,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8889f7dd7c46..ea9ee4518c35 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("-------------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); - printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + !rcu_lockdep_current_cpu_online() + ? "RCU used illegally from offline CPU!\n" + : rcu_is_cpu_idle() + ? 
"RCU used illegally from idle CPU!\n" + : "", + rcu_scheduler_active, debug_locks); /* * If a CPU is in the RCU-free window in idle (ie: in the section diff --git a/kernel/module.c b/kernel/module.c index 178333c48d1e..2c932760fd33 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -62,12 +62,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/module.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -138,7 +132,6 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long *strmap; unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; @@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, return fsa.sym; } - DEBUGP("Failed to find symbol %s\n", name); + pr_debug("Failed to find symbol %s\n", name); return NULL; } EXPORT_SYMBOL_GPL(find_symbol); @@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) list_for_each_entry(use, &b->source_list, source_list) { if (use->source == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); + pr_debug("%s uses %s!\n", a->name, b->name); return 1; } } - DEBUGP("%s does not use %s!\n", a->name, b->name); + pr_debug("%s does not use %s!\n", a->name, b->name); return 0; } @@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - DEBUGP("Allocating new usage for %s.\n", a->name); + pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk(KERN_WARNING "%s: out of memory loading\n", a->name); @@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod) mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; - DEBUGP("%s unusing %s\n", mod->name, i->name); + pr_debug("%s unusing %s\n", mod->name, i->name); module_put(i); 
list_del(&use->source_list); list_del(&use->target_list); @@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } } -unsigned int module_refcount(struct module *mod) +unsigned long module_refcount(struct module *mod) { - unsigned int incs = 0, decs = 0; + unsigned long incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) @@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { - DEBUGP("Looking at refcount...\n"); + pr_debug("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); if (module_refcount(mod) == 0) break; @@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, if (mod->state != MODULE_STATE_LIVE) { /* FIXME: if (force), slam module count and wake up waiter --RR */ - DEBUGP("%s already dying\n", mod->name); + pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; } @@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %u ", module_refcount(mod)); + seq_printf(m, " %lu ", module_refcount(mod)); /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. 
*/ @@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mk->mod)); + return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 }, - .show = show_refcnt, -}; +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); void module_put(struct module *module) { @@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. 
+ */ + return l; +} + static ssize_t show_initstate(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { @@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, @@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, return count; } -struct module_attribute module_uevent = { - .attr = { .name = "uevent", .mode = 0200 }, - .store = store_uevent, -}; +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); static struct module_attribute *modinfo_attrs[] = { + &module_uevent, &modinfo_version, &modinfo_srcversion, - &initstate, - &module_uevent, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD - &refcnt, + &modinfo_refcnt, #endif NULL, }; @@ -1057,7 
+1098,7 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; - DEBUGP("Found checksum %lX vs module %lX\n", + pr_debug("Found checksum %lX vs module %lX\n", maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", name); + pr_debug("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", + pr_debug("Absolute symbol: 0x%08lx\n", (long)sym[i].st_value); break; @@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) info->sechdrs[i].sh_entsize = ~0UL; - DEBUGP("Core section allocation order:\n"); + pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", name); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info) } } - DEBUGP("Init section allocation order:\n"); + pr_debug("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info) continue; s->sh_entsize = 
(get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", sname); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ static void layout_symtab(struct module *mod, struct load_info *info) { Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst; + unsigned int i, nsrc, ndst, strtab_size; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, info->index.sym) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - unsigned int j = src->st_name; - while (!__test_and_set_bit(j, info->strmap) - && info->strtab[j]) - ++j; - ++ndst; + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src->st_name]) + 1; + ndst++; } /* Append room for core symbols at end of core part. 
*/ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_size += strtab_size; /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); - - /* Append room for core symbols' strings at end of core part. */ - info->stroffs = mod->core_size; - __set_bit(0, info->strmap); - mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); mod->core_symtab = dst = mod->module_core + info->symoffs; + mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; *dst = *src; + *s++ = 0; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; + dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(info->strmap, - dst[ndst].st_name); - ++ndst; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; } mod->core_num_syms = ndst; - - mod->core_strtab = s = mod->module_core + info->stroffs; - for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) - if (test_bit(i, info->strmap)) - *++s = mod->strtab[i]; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); + 
pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; @@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ shdr->sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", - shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); } return 0; @@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info) this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, info); - - info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) - * sizeof(long), GFP_KERNEL); - if (!info->strmap) { - err = -ENOMEM; - goto free_percpu; - } layout_symtab(mod, info); /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_strmap; + goto free_percpu; /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; -free_strmap: - kfree(info->strmap); free_percpu: percpu_modfree(mod); out: @@ -2772,7 +2802,6 @@ out: /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { - kfree(info->strmap); percpu_modfree(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); @@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod, struct module *mod; long err; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); /* Copy in the blobs from userspace, check they are vaguely sane. 
*/ @@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto unlink; - /* Get rid of temporary copy and strmap. */ - kfree(info.strmap); + /* Get rid of temporary copy. */ free_copy(&info); /* Done! */ @@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[bx++] = 'P'; - else if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[bx++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[bx++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[bx++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - + bx += module_flags_taint(mod, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c3..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) 
+void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); @@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* diff --git a/kernel/params.c b/kernel/params.c index 65aae11eb93f..4bc965d8a1fe 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -25,12 +25,6 @@ #include <linux/slab.h> #include <linux/ctype.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) -#endif - /* Protects all parameters, and incidentally kmalloced_param list. */ static DEFINE_MUTEX(param_lock); @@ -103,9 +97,10 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; - DEBUGP("They are equal! Calling %p\n", + pr_debug("They are equal! 
Calling %p\n", params[i].ops->set); mutex_lock(&param_lock); err = params[i].ops->set(val, &params[i]); @@ -115,11 +110,11 @@ static int parse_one(char *param, } if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); + pr_debug("Unknown argument: calling %p\n", handle_unknown); return handle_unknown(param, val); } - DEBUGP("Unknown argument `%s'\n", param); + pr_debug("Unknown argument `%s'\n", param); return -ENOENT; } @@ -184,7 +179,7 @@ int parse_args(const char *name, { char *param, *val; - DEBUGP("Parsing ARGS: %s\n", args); + pr_debug("Parsing ARGS: %s\n", args); /* Chew leading spaces */ args = skip_spaces(args); @@ -369,6 +364,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately.
* @@ -541,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. 
+ */ + + tmp.data = &current->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o qos.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore.
*/ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) * successful freezer test. */ freezer_test_done = true; - goto Cleanup; + goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); - goto Cleanup; + goto Thaw; } suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Thaw: + thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; @@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. 
*/ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +520,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +551,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; @@ -616,7 +618,7 @@ int hibernate(void) /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Exit; + goto Enable_umh; printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); @@ -624,15 +626,11 @@ int hibernate(void) error = freeze_processes(); if (error) - goto Finish; + goto Free_bitmaps; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; + if (error || freezer_test_done) goto Thaw; - } if (in_suspend) { unsigned int flags = 0; @@ -657,8 +655,13 @@ int hibernate(void) Thaw: thaw_processes(); - Finish: + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + + Free_bitmaps: free_basic_memory_bitmaps(); + Enable_umh: usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", @@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == 
strlen(*s) && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + error = pm_suspend(state); break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; + } } #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b8..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; } static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ @@ -231,8 +229,25 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + return error; + + error = freeze_kernel_threads(); + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index 77274c9ba2f1..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. 
* - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) @@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, elapsed_csecs % 100); @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. 
*/ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } @@ -188,3 +191,22 @@ void thaw_processes(void) printk("done.\n"); } +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + schedule(); + printk("done.\n"); +} diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, static int __init pm_qos_power_init(void) { int ret = 0; + int i; - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); return ret; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1cf88900ec4f..0de28576807d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: 
Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { @@ -812,7 +813,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } @@ -999,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); + dst = kmap_atomic(d_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } @@ -1727,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2013,9 +2015,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); + dst = 
kmap_atomic(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -2308,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, KM_USER1); - kunmap_atomic(kaddr1, KM_USER0); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { static const struct platform_suspend_ops *suspend_ops; /** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) } /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. */ int suspend_valid_only_mem(suspend_state_t state) { @@ -83,10 +83,11 @@ static int suspend_test(int level) } /** - * suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state. 
* - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. */ static int suspend_prepare(void) { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. * * This function should be called after devices have been suspended. */ @@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } /** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. */ int suspend_devices_and_enter(suspend_state_t state) { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) } /** - * suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence. 
* - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. */ static void suspend_finish(void) { @@ -265,16 +265,14 @@ static void suspend_finish(void) } /** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state) { int error; @@ -310,24 +308,26 @@ int enter_state(suspend_state_t state) } /** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. 
*/ int pm_suspend(suspend_state_t state) { - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; - return ret; + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; } - return -EINVAL; + return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3739ecced085..8742fd013a94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) pr_debug("PM: Free swap pages: %u\n", free_swap); - required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? - nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + required = PAGES_FOR_IO + nr_pages; return free_swap > required; } @@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out_finish; + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); diff --git a/kernel/power/user.c b/kernel/power/user.c index 6b1ab7a88522..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -251,12 +251,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) { - freezer_test_done = false; - 
thaw_processes(); - } + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } break; @@ -274,6 +270,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = 0; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: diff --git a/kernel/printk.c b/kernel/printk.c index 989e4a52da76..b663c2c95d39 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -44,6 +44,9 @@ #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS +#include <trace/events/printk.h> + /* * Architectures can override it: */ @@ -521,7 +524,7 @@ static void __call_console_drivers(unsigned start, unsigned end) } } -static int __read_mostly ignore_loglevel; +static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { @@ -532,7 +535,7 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); -module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); @@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" static void _call_console_drivers(unsigned start, unsigned end, int msg_log_level) { + trace_console(&LOG_BUF(0), start, end, log_buf_len); + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { @@ -696,12 +701,15 @@ static void zap_locks(void) } #if 
defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; +static bool printk_time = 1; #else -static int printk_time = 0; +static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -1098,7 +1106,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -int console_suspend_enabled = 1; +bool console_suspend_enabled = 1; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) @@ -1208,13 +1216,27 @@ int is_console_locked(void) return console_locked; } +/* + * Delayed printk facility, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - __this_cpu_write(printk_pending, 0); - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } } @@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_write(printk_pending, 1); + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } /** @@ -1621,6 +1643,26 @@ late_initcall(printk_late_init); #if defined CONFIG_PRINTK +int printk_sched(const char *fmt, ...) 
+{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * @@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) unsigned long l1, l2; unsigned long flags; + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; + /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 78ab24a7b0e4..00ab2ca5ed11 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -172,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } +static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +{ + if (mode & PTRACE_MODE_NOAUDIT) + return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + else + return has_ns_capability(current, ns, CAP_SYS_PTRACE); +} + int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -198,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) cred->gid == tcred->sgid && cred->gid == tcred->gid)) goto ok; - if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + if (ptrace_has_cap(tcred->user->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -207,7 +215,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -277,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request, 
task->ptrace = PT_PTRACED; if (seize) task->ptrace |= PT_SEIZED; - if (task_ns_capable(task, CAP_SYS_PTRACE)) + if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/rcu.h b/kernel/rcu.h index aa88baab5f78..8ba99cdc6515 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -33,8 +33,27 @@ * Process-level increment to ->dynticks_nesting field. This allows for * architectures that use half-interrupts and half-exceptions from * process context. + * + * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH + * that counts the number of process-based reasons why RCU cannot + * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE + * is the value used to increment or decrement this field. + * + * The rest of the bits could in principle be used to count interrupts, + * but this would mean that a negative-one value in the interrupt + * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. + * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK + * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. + * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon + * initial exit from idle. 
*/ -#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) +#define DYNTICK_TASK_NEST_WIDTH 7 +#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) +#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) +#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) +#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) +#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ + DYNTICK_TASK_FLAG) /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally @@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr; static inline void debug_rcu_head_queue(struct rcu_head *head) { - WARN_ON_ONCE((unsigned long)head & 0x3); debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, @@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + return 0; } } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2bc4e135ff23..a86f1741cc27 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. 
*/ int rcu_read_lock_bh_held(void) { @@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 977296dca0a4..37a5444204d2 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; +static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long oldval) @@ -88,10 +88,16 @@ void rcu_idle_enter(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting = 0; + WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); + if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) + rcu_dynticks_nesting = 0; + else + rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_idle_enter_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); /* * Exit an interrupt handler towards idle. @@ -140,11 +146,15 @@ void rcu_idle_exit(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = DYNTICK_TASK_NESTING; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) + rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_idle_exit_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); /* * Enter an interrupt handler, moving away from idle. @@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. 
*/ if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, ACCESS_ONCE(rcp->rcucblist), need_resched(), @@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 9cb1ae4aabdd..22ecea0dfb62 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void) /* * Check for a running RCU reader. Because there is only one CPU, * there can be but one running RCU reader at a time. ;-) + * + * Returns zero if there are no running readers. Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section. Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section + * + * Returns zero if there are no running readers. 
Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section. Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section. */ static int rcu_preempt_running_reader(void) { @@ -307,7 +318,6 @@ static int rcu_boost(void) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; raw_local_irq_restore(flags); rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void) unsigned long flags; local_irq_save(flags); /* must exclude scheduler_tick(). */ - if (rcu_preempt_running_reader() && + if (rcu_preempt_running_reader() > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void) list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); if (rcu_cpu_blocking_cur_gp()) rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } else if (rcu_preempt_running_reader() < 0 && + t->rcu_read_unlock_special) { + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
*/ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; unsigned long flags; struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ int special; /* @@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_cpu_qs(); /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t) } #ifdef CONFIG_RCU_BOOST /* Unboost self if was boosted. */ - if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; - rt_mutex_unlock(t->rcu_boost_mutex); + if (t->rcu_boost_mutex != NULL) { + rbmp = t->rcu_boost_mutex; t->rcu_boost_mutex = NULL; + rt_mutex_unlock(rbmp); } #endif /* #ifdef CONFIG_RCU_BOOST */ local_irq_restore(flags); @@ -618,13 +638,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } 
EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void) invoke_rcu_callbacks(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader()) + rcu_preempt_running_reader() > 0) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } @@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!rcu_scheduler_active) return; @@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void) static void invoke_rcu_callbacks(void) { have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); + if (rcu_kthread_task != NULL) + wake_up(&rcu_kthread_wq); } #ifdef CONFIG_RCU_TRACE @@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads); #else /* #ifdef CONFIG_RCU_BOOST */ +/* Hold off callback invocation until early_initcall() time. */ +static int rcu_scheduler_fully_active __read_mostly; + /* * Start up softirq processing of callbacks. */ void invoke_rcu_callbacks(void) { - raise_softirq(RCU_SOFTIRQ); + if (rcu_scheduler_fully_active) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_RCU_TRACE @@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void) #endif /* #ifdef CONFIG_RCU_TRACE */ -void rcu_init(void) +static int __init rcu_scheduler_really_started(void) { + rcu_scheduler_fully_active = 1; open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. 
*/ + return 0; } +early_initcall(rcu_scheduler_really_started); #endif /* #else #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88f17b8a3b1d..a89b381a8c6e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool verbose; /* Print more debug info. */ +static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ @@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ +static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ +static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ +static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ static int test_boost_duration = 4; /* Duration of each boost test, seconds. 
*/ @@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); module_param(shutdown_secs, int, 0444); MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); module_param(test_boost_interval, int, 0444); @@ -129,6 +138,7 @@ static struct task_struct *shutdown_task; #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *onoff_task; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); return; } + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); schedule_timeout_interruptible(HZ); continue; } + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if 
(p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); @@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d\n", + "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval); + onoff_interval, onoff_holdoff); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1399,7 +1409,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int +static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; @@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + } while (!kthread_should_stop()) { cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { @@ -1447,15 +1462,18 @@ rcu_torture_onoff(void *arg) return 0; } -static int +static int __cpuinit rcu_torture_onoff_init(void) { + int ret; + if (onoff_interval <= 0) return 0; onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); onoff_task = NULL; - return PTR_ERR(onoff_task); + return ret; } return 0; } @@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void) #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +/* + * CPU-stall kthread. 
It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. + */ +static int __cpuinit rcu_torture_stall(void *args) +{ + unsigned long stop_at; + + VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + if (stall_cpu_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + schedule_timeout_interruptible(stall_cpu_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + } + if (!kthread_should_stop()) { + stop_at = get_seconds() + stall_cpu; + /* RCU CPU stall is expected behavior in following code. */ + printk(KERN_ALERT "rcu_torture_stall start.\n"); + rcu_read_lock(); + preempt_disable(); + while (ULONG_CMP_LT(get_seconds(), stop_at)) + continue; /* Induce RCU CPU stall warning. */ + preempt_enable(); + rcu_read_unlock(); + printk(KERN_ALERT "rcu_torture_stall end.\n"); + } + rcutorture_shutdown_absorb("rcu_torture_stall"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(10 * HZ); + return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ + int ret; + + if (stall_cpu <= 0) + return 0; + stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); + if (IS_ERR(stall_task)) { + ret = PTR_ERR(stall_task); + stall_task = NULL; + return ret; + } + return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. 
*/ +static void rcu_torture_stall_cleanup(void) +{ + if (stall_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + kthread_stop(stall_task); +} + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_stall_cleanup(); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } @@ -1819,6 +1899,7 @@ rcu_torture_init(void) } rcu_torture_onoff_init(); register_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_stall_init(); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@ #include <linux/wait.h> #include <linux/kthread.h> #include <linux/prefetch.h> +#include <linux/delay.h> +#include <linux/stop_machine.h> #include "rcutree.h" #include <trace/events/rcu.h> @@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu) EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_NESTING, + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), }; @@ -208,8 +210,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -int rcu_cpu_stall_suppress __read_mostly; 
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -#ifdef CONFIG_SMP - /* * If the specified CPU is offline, tell the caller that it is in * a quiescent state. Otherwise, whack it with a reschedule IPI. @@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_implicit_offline_qs(struct rcu_data *rdp) { /* - * If the CPU is offline, it is in a quiescent state. We can - * trust its state not to change because interrupts are disabled. + * If the CPU is offline for more than a jiffy, it is in a quiescent + * state. We can trust its state not to change because interrupts + * are disabled. The reason for the jiffy's worth of slack is to + * handle CPUs initializing on the way up and finding their way + * to the idle loop on the way down. */ - if (cpu_is_offline(rdp->cpu)) { + if (cpu_is_offline(rdp->cpu) && + ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } - - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. 
- */ - if (rdp->cpu != smp_processor_id()) - smp_send_reschedule(rdp->cpu); - else - set_need_resched(); - rdp->resched_ipi++; return 0; } -#endif /* #ifdef CONFIG_SMP */ - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) atomic_inc(&rdtp->dynticks); smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + + /* + * The idle task is not permitted to enter the idle loop while + * in an RCU read-side critical section. + */ + rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /** @@ -389,10 +394,15 @@ void rcu_idle_enter(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting = 0; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. 
* - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. @@ -476,11 +486,15 @@ void rcu_idle_exit(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online? Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active. Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask. Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask. This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. 
+ * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + bool ret; + + if (in_nmi()) + return 1; + preempt_disable(); + rdp = &__get_cpu_var(rcu_sched_data); + rnp = rdp->mynode; + ret = (rdp->grpmask & rnp->qsmaskinit) || + !rcu_scheduler_fully_active; + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* #ifdef CONFIG_PROVE_RCU */ /** @@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void) return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } -#ifdef CONFIG_SMP - /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU @@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } -#endif /* #ifdef CONFIG_SMP */ +static int jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+ */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; + rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } static void print_other_cpu_stall(struct rcu_state *rsp) @@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - ndetected = rcu_print_task_stall(rnp); + rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", rsp->name); + print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); @@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) { - printk(" %d", rnp->grplo + cpu); + print_cpu_stall_info(rsp, rnp->grplo + cpu); ndetected++; } } - printk("} (detected by %d, t=%ld jiffies)\n", + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. 
+ */ + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected = rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + print_cpu_stall_info_end(); + printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", - rsp->name, smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + print_cpu_stall_info_begin(); + print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info_end(); + printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); if (!trigger_all_cpu_backtrace()) dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + rsp->jiffies_stall = jiffies + + 3 * jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ @@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; + zero_cpu_stall_ticks(rdp); } } @@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. 
*/ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - - /* Special-case the common single-level case. */ - if (NUM_RCU_NODES == 1) { - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - /* Exclude any concurrent CPU-hotplug operations. */ raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ @@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context. Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask. This allows us to randomly pick a callback + * destination from the bits set in that mask. 
*/ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { int i; - /* current DYING CPU is cleared in the cpu_online_mask */ + unsigned long mask; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ + + /* First, adjust the counts. */ + if (rdp->nxtlist != NULL) { + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } - if (rdp->nxtlist == NULL) - return; /* irqs disabled, so comparison is stable. */ + /* + * Next, move ready-to-invoke callbacks to be invoked on some + * other CPU. These will not be required to pass through another + * grace period: They are done, regardless of CPU. + */ + if (rdp->nxtlist != NULL && + rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { + struct rcu_head *oldhead; + struct rcu_head **oldtail; + struct rcu_head **newtail; + + oldhead = rdp->nxtlist; + oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; + *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; + newtail = rdp->nxttail[RCU_DONE_TAIL]; + for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { + if (receive_rdp->nxttail[i] == oldtail) + receive_rdp->nxttail[i] = newtail; + if (rdp->nxttail[i] == newtail) + rdp->nxttail[i] = &rdp->nxtlist; + } + } - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen += rdp->qlen; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; + /* + * Finally, put the rest of the callbacks at the end of the list. + * The ones that made it partway through get to start over: We + * cannot assume that grace periods are synchronized across CPUs. 
+ * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but + * this does not seem compelling. Not yet, anyway.) + */ + if (rdp->nxtlist != NULL) { + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = + rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + } - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; + /* + * Record a quiescent state for the dying CPU. This is safe + * only because we have already cleared out the callbacks. + * (Otherwise, the RCU core might try to schedule the invocation + * of callbacks on this now-offline CPU, which would be bad.) + */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - !!(rnp->qsmask & mask), + "cpuofl"); + rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); + /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ } /* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. * There can only be one CPU hotplug operation at a time, so no other * CPU can be attempting to update rcu_cpu_kthread_task. */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; unsigned long mask; int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + /* Adjust any no-longer-needed kthreads. 
*/ rcu_stop_cpu_kthread(cpu); + rcu_node_kthread_setaffinity(rnp, -1); + + /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ @@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - else - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); break; } - if (rnp == rdp->mynode) { - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); + if (rnp == rdp->mynode) need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - } else + else raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; @@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); - rcu_node_kthread_setaffinity(rnp, -1); -} - -/* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU. This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. 
- */ -static void rcu_offline_cpu(int cpu) -{ - __rcu_offline_cpu(cpu, &rcu_sched_state); - __rcu_offline_cpu(cpu, &rcu_bh_state); - rcu_preempt_offline_cpu(cpu); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count; + int bl, count, count_lazy; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen, bl); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); /* Invoke callbacks. */ - count = 0; + count = count_lazy = 0; while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(rsp->name, list); + if (__rcu_reclaim(rsp->name, list)) + count_lazy++; list = next; /* Stop only if limit reached and CPU has something to do. 
*/ if (++count >= bl && @@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ + rdp->qlen_lazy -= count_lazy; rdp->qlen -= count; rdp->n_cbs_invoked += count; if (list != NULL) { @@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); + increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { /* @@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user) trace_rcu_utilization("End scheduler-tick"); } -#ifdef CONFIG_SMP - /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. @@ -1616,15 +1724,6 @@ unlock_fqs_ret: trace_rcu_utilization("End fqs"); } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ - /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to @@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void) static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp) + struct rcu_state *rsp, bool lazy) { unsigned long flags; struct rcu_data *rdp; + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ debug_rcu_head_queue(head); head->func = func; head->next = NULL; @@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. 
*/ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (lazy) + rdp->qlen_lazy++; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen); + rdp->qlen_lazy, rdp->qlen); else - trace_rcu_callback(rsp->name, head, rdp->qlen); + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { @@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state); + __call_rcu(head, func, &rcu_sched_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); /* - * Queue an RCU for invocation after a quicker grace period. + * Queue an RCU callback for invocation after a quicker grace period. */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state); + __call_rcu(head, func, &rcu_bh_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_sched); @@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t 
sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restrictions will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period.
We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. 
+ */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started); + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu) /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_needs_cpu(cpu); + rcu_preempt_cpu_has_callbacks(cpu); } static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; @@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; @@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 
atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); rcu_prepare_for_idle_init(cpu); @@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_send_cbs_to_online(&rcu_bh_state); - rcu_send_cbs_to_online(&rcu_sched_state); - rcu_preempt_send_cbs_to_online(); + rcu_cleanup_dying_cpu(&rcu_bh_state); + rcu_cleanup_dying_cpu(&rcu_sched_state); + rcu_preempt_cleanup_dying_cpu(); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_offline_cpu(cpu); + rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); + rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); + rcu_preempt_cleanup_dead_cpu(cpu); break; default: break; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fddff92d6676..cdd1be0a4072 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -239,6 +239,12 @@ struct rcu_data { bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned long ticks_this_gp; /* The number of scheduling-clock */ + /* ticks this CPU has handled */ + /* during and after the last grace */ + /* period it is aware of. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -265,7 +271,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen_lazy; /* # of lazy queued callbacks */ + long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. 
*/ @@ -282,7 +289,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ - unsigned long resched_ipi; /* Sent a resched IPI. */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ @@ -313,12 +319,6 @@ struct rcu_data { #else #define RCU_STALL_DELAY_DELTA 0 #endif - -#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ - RCU_STALL_DELAY_DELTA) - /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) - /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ @@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); @@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); +static int rcu_preempt_cpu_has_callbacks(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_online(void); +static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct 
rcu_node *rnp); @@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void print_cpu_stall_info_begin(void); +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info_end(void); +static void zero_cpu_stall_ticks(struct rcu_data *rdp); +static void increment_cpu_stall_ticks(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8bb35d73e1f9..c023464816be 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,7 +25,6 @@ */ #include <linux/delay.h> -#include <linux/stop_machine.h> #define RCU_KTHREAD_PRIO 1 @@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); + printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); +#endif +#if defined(CONFIG_RCU_CPU_STALL_INFO) + printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); @@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + printk(KERN_CONT "\n"); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +} + +static void rcu_print_task_stall_end(void) +{ +} + +#endif /* 
#else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; + rcu_print_task_stall_begin(rnp); t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(" P%d", t->pid); + printk(KERN_CONT " P%d", t->pid); ndetected++; } + rcu_print_task_stall_end(); return ndetected; } @@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * absolutely necessary, but this is a good performance/complexity * tradeoff. */ - if (rcu_preempt_blocked_readers_cgp(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; @@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return retval; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Do CPU-offline processing for preemptible RCU. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { - __rcu_offline_cpu(cpu, &rcu_preempt_state); + rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state); + __call_rcu(head, func, &rcu_preempt_state, 0); } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Queue an RCU callback for lazy invocation after a grace period. 
+ * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; wait_rcu_gp(call_rcu); @@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. + * In fact, if you are using synchronize_rcu_expedited() in a loop, + * please restructure your code to batch your updates, and then Use a + * single synchronize_rcu() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. 
And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. */ void synchronize_rcu_expedited(void) { @@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptible RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU have callbacks on this CPU? */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } @@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptible RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU + * and record a quiescent state. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { - rcu_send_cbs_to_online(&rcu_preempt_state); + rcu_cleanup_dying_cpu(&rcu_preempt_state); } /* @@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return 0; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void) } /* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + * + * Because there is no preemptible RCU, we use RCU-sched instead. 
+ */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. */ @@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptible RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never has callbacks */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return 0; } @@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptible RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there is no cleanup to do. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { } @@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. 
*/ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). - */ -void synchronize_sched_expedited(void) -{ - int firstsnap, s, snap, trycount = 0; - - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); - get_online_cpus(); - - /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. 
- */ - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. - */ - get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); - smp_mb(); /* ensure read is before try_stop_cpus(). */ - } - - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. - */ - do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ - break; - } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); - - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ - #if !defined(CONFIG_RCU_FAST_NO_HZ) /* @@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu) } /* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. 
*/ static void rcu_prepare_for_idle(int cpu) @@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! + * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is + * permitted to sleep in dyntick-idle mode with only lazy RCU + * callbacks pending. Setting this too high can OOM your system. * * The values below work well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though @@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu) #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; +static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ +static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu) } /* + * Does the specified flavor of RCU have non-lazy callbacks pending on + * the specified CPU? Both RCU flavor and CPU are specified by the + * rcu_data structure. + */ +static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) +{ + return rdp->qlen != rdp->qlen_lazy; +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +/* + * Are there non-lazy RCU-preempt callbacks? (There cannot be if there + * is no RCU-preempt in the kernel.) 
+ */ +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + + return __rcu_cpu_has_nonlazy_callbacks(rdp); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + return 0; +} + +#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Does any flavor of RCU have non-lazy callbacks on the specified CPU? + */ +static bool rcu_cpu_has_nonlazy_callbacks(int cpu) +{ + return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || + __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_cpu_has_nonlazy_callbacks(cpu); +} + +/* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks * pending. The hander doesn't really need to do anything because the @@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu) unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); + rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); firsttime = 0; } } @@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { - unsigned long flags; - - local_irq_save(flags); - /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu) if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; - local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } @@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu) * refrained from disabling the scheduling-clock tick. 
*/ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { - local_irq_restore(flags); trace_rcu_prep_idle("In holdoff"); return; } @@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu) /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu)) { + !rcu_pending(cpu) && + !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + else + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - local_irq_restore(flags); trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. 
*/ return; @@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu) */ #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_preempt_qs(cpu); force_quiescent_state(&rcu_preempt_state, 0); - local_irq_save(flags); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - local_irq_save(flags); } /* @@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu) * So try forcing the callbacks through the grace period. */ if (rcu_cpu_has_callbacks(cpu)) { - local_irq_restore(flags); trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else { - local_irq_restore(flags); + } else trace_rcu_prep_idle("Callbacks drained"); - } } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + sprintf(cp, "drain=%d %c timer=%lld", + per_cpu(rcu_dyntick_drain, cpu), + per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + hrtimer_active(hrtp) + ? ktime_to_us(hrtimer_get_remaining(hrtp)) + : -1); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* Initiate the stall-info list. */ +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT "\n"); +} + +/* + * Print out diagnostic information for the specified stalled CPU. 
+ * + * If the specified CPU is aware of the current RCU grace period + * (flavor specified by rsp), then print the number of scheduling + * clock interrupts the CPU has taken during the time that it has + * been aware. Otherwise, print the number of RCU grace periods + * that this CPU is ignorant of, for example, "1" if the CPU was + * aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = rdp->dynticks; + char *ticks_title; + unsigned long ticks_value; + + if (rsp->gpnum == rdp->gpnum) { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } else { + ticks_title = "GPs behind"; + ticks_value = rsp->gpnum - rdp->gpnum; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + cpu, ticks_value, ticks_title, + atomic_read(&rdtp->dynticks) & 0xfff, + rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + fast_no_hz); +} + +/* Terminate the stall-info list. */ +static void print_cpu_stall_info_end(void) +{ + printk(KERN_ERR "\t"); +} + +/* Zero ->ticks_this_gp for all flavors of RCU. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; +} + +/* Increment ->ticks_this_gp for all flavors of RCU. 
*/ +static void increment_cpu_stall_ticks(void) +{ + __get_cpu_var(rcu_sched_data).ticks_this_gp++; + __get_cpu_var(rcu_bh_data).ticks_this_gp++; +#ifdef CONFIG_TREE_PREEMPT_RCU + __get_cpu_var(rcu_preempt_data).ticks_this_gp++; +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT " {"); +} + +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + printk(KERN_CONT " %d", cpu); +} + +static void print_cpu_stall_info_end(void) +{ + printk(KERN_CONT "} "); +} + +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +} + +static void increment_cpu_stall_ticks(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 654cfe67f0d1..ed459edeff43 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c", - rdp->qlen, + seq_printf(m, " of=%lu", rdp->offline_fqs); + seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", + rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, + seq_printf(m, ",%lu", rdp->offline_fqs); + seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -168,7 +168,7 @@ static int 
show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); + seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2d..ab56a1764d4d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 6d269cce7aa1..d508363858b3 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -66,6 +66,31 @@ done: return ret; } +int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret, r; + unsigned long flags; + struct res_counter *c; + + r = ret = 0; + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + r = res_counter_charge_locked(c, val); + if (r) + c->usage += val; + spin_unlock(&c->lock); + if (r < 0 && ret == 0) { + *limit_fail_at = c; + ret = r; + } + } + local_irq_restore(flags); + + return ret; +} void res_counter_uncharge_locked(struct res_counter *counter, unsigned long 
val) { if (WARN_ON(counter->usage < val)) diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_unlock(&resource_lock); return result; } +EXPORT_SYMBOL(adjust_resource); static void __init __reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, @@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root, write_unlock(&resource_lock); } -EXPORT_SYMBOL(adjust_resource); - /** * resource_alignment - calculate resource's alignment * @res: resource pointer diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < -20 || nice > 19) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. 
*/ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cecbb64be05f..503d6426126d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,9 +71,11 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> +#include <linux/binfmts.h> #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -161,13 +163,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -175,14 +177,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -723,9 +725,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) p->sched_class->dequeue_task(rq, p, flags); } 
-/* - * activate_task - move a task to the runqueue. - */ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -734,9 +733,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); } -/* - * deactivate_task - remove a task from the runqueue. - */ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -899,7 +895,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -1289,7 +1285,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + printk_sched("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } @@ -1512,7 +1508,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1523,7 +1519,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -1937,7 +1933,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); 
fire_sched_in_preempt_notifiers(current); if (mm) @@ -2272,13 +2267,10 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { long delta, active, n; - if (time_before(jiffies, calc_load_update)) - return; - /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2290,31 +2282,25 @@ static void calc_global_nohz(unsigned long ticks) atomic_long_add(delta, &calc_load_tasks); /* - * If we were idle for multiple load cycles, apply them. + * It could be the one fold was all it took, we done! */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; + if (time_before(jiffies, calc_load_update + 10)) + return; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - calc_load_update += n * LOAD_FREQ; - } + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. 
- * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ + calc_load_update += n * LOAD_FREQ; } #else void calc_load_account_idle(struct rq *this_rq) @@ -2326,7 +2312,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { } #endif @@ -2354,8 +2340,6 @@ void calc_global_load(unsigned long ticks) { long active; - calc_global_nohz(ticks); - if (time_before(jiffies, calc_load_update + 10)) return; @@ -2367,6 +2351,16 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; + + /* + * Account one period with whatever state we found before + * folding in the nohz state and ageing the entire idle period. + * + * This avoids loosing a sample when we go idle between + * calc_load_account_active() (10 ticks ago) and now and thus + * under-accounting. + */ + calc_global_nohz(); } /* @@ -2761,7 +2755,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); @@ -3226,14 +3220,14 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; /* * If we are going to sleep and we have plugged IO queued, @@ -3252,6 +3246,18 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. 
Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -3412,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, 1, 0, NULL); + __wake_up_common(q, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); @@ -3773,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3796,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4134,7 +4157,7 @@ recheck: on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4147,7 +4170,7 @@ recheck: if (running) p->sched_class->set_curr_task(rq); if (on_rq) - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4330,7 +4353,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) + if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); @@ -4480,7 +4503,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); @@ -4554,8 +4577,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! 
+ * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! */ void __sched yield(void) { @@ -4998,9 +5037,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (p->on_rq) { - deactivate_task(rq_src, p, 0); + dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + enqueue_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } done: @@ -5387,7 +5426,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5759,7 +5798,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache(). 
*/ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); @@ -6936,6 +6975,9 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; @@ -7032,10 +7074,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) on_rq = p->on_rq; if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); resched_task(rq->curr); } @@ -7134,10 +7176,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); @@ -7246,9 +7284,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { @@ -7537,8 +7572,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7555,15 +7589,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, 
struct cgroup_taskset *tset) { struct task_struct *task; @@ -7581,7 +7614,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7591,8 +7624,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7942,8 +7975,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7973,8 +8005,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b0d798eaf130..d72586fdf660 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the cpu priority setting * @cp: The cpupri context * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * @@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context - * @bootmem: true if allocations need to use bootmem 
* * Returns: -ENOMEM if memory fails. */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); - P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f8..94340c7544a9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } +#ifdef CONFIG_SMP + if (entity_is_task(se)) + list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif cfs_rq->nr_running++; } @@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; 
se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2693,8 +2678,6 @@ next: } while (sg != sd->groups); } done: - rcu_read_unlock(); - return target; } @@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as pull_task(), in which we + * This is possible from callers such as move_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. 
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 + +struct lb_env { + struct sched_domain *sd; + + int src_cpu; + struct rq *src_rq; + + int dst_cpu; + struct rq *dst_rq; + + enum cpu_idle_type idle; + long load_move; + unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; +}; + /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env) { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); } /* @@ -3129,17 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; /* @@ -3148,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *lb_flags &= ~LBF_ALL_PINNED; + env->flags &= ~LBF_ALL_PINNED; - if (task_running(rq, p)) { + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3165,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock_task, sd); + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3191,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * * Called with both runqueues locked. 
*/ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - int pinned = 0; - for_each_leaf_cfs_rq(busiest, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) - break; + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) + continue; - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) - continue; + if (!can_migrate_task(p, env)) + continue; - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - return 1; - } + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; } - return 0; } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) +static unsigned long task_h_load(struct task_struct *p); + +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
+ */ +static int move_tasks(struct lb_env *env) { - int loops = 0, pulled = 0; - long rem_load_move = max_load_move; - struct task_struct *p, *n; + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; + unsigned long load; + int pulled = 0; + + if (env->load_move <= 0) + return 0; + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); - if (max_load_move == 0) - goto out; + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) + break; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + /* take a breather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sysctl_sched_nr_migrate; + env->flags |= LBF_NEED_BREAK; break; } - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) - continue; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + goto next; + + load = task_h_load(p); + + if (load < 16 && !env->sd->nr_balance_failed) + goto next; + + if ((load / 2) > env->load_move) + goto next; - pull_task(busiest, p, this_rq, this_cpu); + if (!can_migrate_task(p, env)) + goto next; + + move_task(p, env); pulled++; - rem_load_move -= p->se.load.weight; + env->load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3257,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) break; - } #endif /* * We only want to steal up to the prescribed amount of * weighted load. 
*/ - if (rem_load_move <= 0) + if (env->load_move <= 0) break; + + continue; +next: + list_move_tail(&p->se.group_node, tasks); } -out: + /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). */ - schedstat_add(sd, lb_gained[idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], pulled); - return max_load_move - rem_load_move; + return pulled; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3358,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + rcu_read_unlock(); } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long task_h_load(struct task_struct *p) { - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; - - rcu_read_lock(); - update_h_load(cpu_of(busiest)); - - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); - - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, 
busiest_weight + 1); + struct cfs_rq *cfs_rq = task_cfs_rq(p); + unsigned long load; - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); + load = p->se.load.weight; + load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return max_load_move - rem_load_move; + return load; } #else static inline void update_shares(int cpu) { } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static inline void update_h_load(long cpu) { - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); } -#endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long task_h_load(struct task_struct *p) { - unsigned long total_load_moved = 0, load_moved; - - do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - - total_load_moved += load_moved; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. 
- */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; + return p->se.load.weight; } +#endif /********** Helpers for find_busiest_group ************************/ /* @@ -3774,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3881,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); + if (local_group) { + if (idle != CPU_NEWLY_IDLE) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4449,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct lb_env env = { + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .idle = idle, + .loop_break = sysctl_sched_nr_migrate, + }; + cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4490,30 +4444,34 @@ redo: * 
still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; + env.load_move = imbalance; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = busiest->nr_running; + +more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); + if (!env.loop) + update_h_load(env.src_cpu); + ld_moved += move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * some other cpu did the load balance for us. */ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (lb_flags & LBF_ABORT) - goto out_balanced; - - if (lb_flags & LBF_NEED_BREAK) { - lb_flags &= ~LBF_NEED_BREAK; - goto redo; - } - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4543,7 +4501,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4596,7 +4554,7 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && + if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4706,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + schedstat_inc(sd, alb_count); - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, 
CPU_IDLE)) + if (move_one_task(&env)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -4862,6 +4828,15 @@ static void nohz_balancer_kick(int cpu) return; } +static inline void clear_nohz_tick_stopped(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -4900,6 +4875,12 @@ void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + if (stop_tick) { if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -4910,12 +4891,22 @@ void select_nohz_load_balancer(int stop_tick) } return; } + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + clear_nohz_tick_stopped(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} #endif static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. @@ -5066,11 +5057,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. 
*/ set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + clear_nohz_tick_stopped(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5313,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -5585,7 +5571,9 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ + nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3640ebbb466b..b60dad720173 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling 
activated\n"); + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + static bool once = false; + + rt_rq->rt_throttled = 1; + + if (!once) { + once = true; + printk_sched("sched: RT throttling activated\n"); + } + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -1587,6 +1612,11 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); @@ -1967,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = DEF_TIMESLICE; + p->rt.time_slice = RR_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -1995,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return DEF_TIMESLICE; + return RR_TIMESLICE; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..42b1f304b044 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning 
knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. */ -#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. @@ -216,9 +212,6 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct list_head tasks; - struct list_head *balance_iterator; - /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). @@ -246,11 +239,6 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* * h_load = weight * f(tg) * * Where f(tg) is the recursive weight fraction assigned to @@ -424,6 +412,8 @@ struct rq { int cpu; int online; + struct list_head cfs_tasks; + u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -462,7 +452,6 @@ struct rq { unsigned int yld_count; /* schedule() stats */ - unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; @@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/jump_label.h> +# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +619,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. 
*/ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631d..e8d76c5895ea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. 
*/ +#include <linux/audit.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/compat.h> @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } diff --git a/kernel/signal.c b/kernel/signal.c index c73c4284160e..e76001ccf5cd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,13 +1054,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; int override_rlimit; - - trace_signal_generate(sig, info, t); + int ret = 0, result; assert_spin_locked(&t->sighand->siglock); + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, from_ancestor_ns)) - return 0; + goto ret; pending = group ? &t->signal->shared_pending : &t->pending; /* @@ -1068,8 +1068,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ + result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) - return 0; + goto ret; + + result = TRACE_SIGNAL_DELIVERED; /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. @@ -1127,14 +1130,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * signal was rt and sent by user using something * other than kill(). */ - trace_signal_overflow_fail(sig, group, info); - return -EAGAIN; + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. 
*/ - trace_signal_lose_info(sig, group, info); + result = TRACE_SIGNAL_LOSE_INFO; } } @@ -1142,7 +1146,9 @@ out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); - return 0; +ret: + trace_signal_generate(sig, info, t, group, result); + return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, @@ -1585,7 +1591,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) int sig = q->info.si_signo; struct sigpending *pending; unsigned long flags; - int ret; + int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1594,6 +1600,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) goto ret; ret = 1; /* the signal is ignored */ + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, 0)) goto out; @@ -1605,6 +1612,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) */ BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; + result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } q->info.si_overrun = 0; @@ -1614,7 +1622,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); + result = TRACE_SIGNAL_DELIVERED; out: + trace_signal_generate(sig, &q->info, t, group, result); unlock_task_sighand(t, &flags); ret: return ret; @@ -1642,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + if (sig != SIGCHLD) { + /* + * This is only possible if parent == real_parent. + * Check if it has changed security domain. 
+ */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..671f9594e368 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -297,7 +297,7 @@ void irq_enter(void) int cpu = smp_processor_id(); rcu_irq_enter(); - if (idle_cpu(cpu) && !in_interrupt()) { + if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. @@ -310,31 +310,21 @@ void irq_enter(void) __irq_enter(); } -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void invoke_softirq(void) { - if (!force_irqthreads) + if (!force_irqthreads) { +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED __do_softirq(); - else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } -} #else -static inline void invoke_softirq(void) -{ - if (!force_irqthreads) do_softirq(); - else { +#endif + } else { __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } } -#endif /* * Exit an interrupt context. 
Process softirqs if needed and possible: @@ -353,7 +343,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); } /* @@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; @@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } __set_current_state(TASK_RUNNING); @@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu); diff --git a/kernel/srcu.c b/kernel/srcu.c index 0febf61e1aa3..ba35f3a4a1f4 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; + rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && + !lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + idx = sp->completed; mutex_lock(&sp->mutex); @@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp) EXPORT_SYMBOL_GPL(synchronize_srcu); /** - * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * synchronize_srcu_expedited - Brute-force SRCU grace period * @sp: srcu_struct with which to synchronize. * - * Flip the completed counter, and wait for the old count to drain to zero. 
- * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. + * Wait for an SRCU grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_srcu_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_srcu() instead. * - * Note that it is illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; doing so - * will result in deadlock. However, it is perfectly legal to call - * synchronize_srcu_expedited() on one srcu_struct from some other - * srcu_struct's read-side critical section. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. It is also illegal to call + * synchronize_srcu_expedited() from the corresponding SRCU read-side + * critical section; doing so will result in deadlock. However, it is + * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct + * from some other srcu_struct's read-side critical section, as long as + * the resulting graph of srcu_structs is acyclic. 
*/ void synchronize_srcu_expedited(struct srcu_struct *sp) { diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f8..888d227fd195 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < 
RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae2719643854..11d53046b905 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> +#include <linux/binfmts.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -803,6 +804,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f6117a4c7cb8..6e039b144daf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,13 +22,16 @@ * NTP timekeeping variables: */ +DEFINE_SPINLOCK(ntp_lock); + + /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; /* ACTHZ period (nsecs): */ 
unsigned long tick_nsec; -u64 tick_length; +static u64 tick_length; static u64 tick_length_base; static struct hrtimer leap_timer; @@ -49,7 +52,7 @@ static struct hrtimer leap_timer; static int time_state = TIME_OK; /* clock status bits: */ -int time_status = STA_UNSYNC; +static int time_status = STA_UNSYNC; /* TAI offset (secs): */ static long time_tai; @@ -133,7 +136,7 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -149,7 +152,7 @@ static inline void pps_clear(void) * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -233,6 +236,17 @@ static inline void pps_fill_timex(struct timex *txc) #endif /* CONFIG_NTP_PPS */ + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + /* * NTP methods: */ @@ -275,7 +289,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs) time_status |= STA_MODE; - return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) @@ -330,11 +344,13 @@ static void ntp_update_offset(long offset) /** * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock */ void ntp_clear(void) { + unsigned long flags; + + spin_lock_irqsave(&ntp_lock, flags); + time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -347,8 +363,23 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); + 
spin_unlock_irqrestore(&ntp_lock, flags); + } + +u64 ntp_tick_length(void) +{ + unsigned long flags; + s64 ret; + + spin_lock_irqsave(&ntp_lock, flags); + ret = tick_length; + spin_unlock_irqrestore(&ntp_lock, flags); + return ret; +} + + /* * Leap second processing. If in leap-insert state at the end of the * day, the system clock is set back one second; if in leap-delete @@ -357,14 +388,15 @@ void ntp_clear(void) static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; + unsigned long flags; + int leap = 0; - write_seqlock(&xtime_lock); - + spin_lock_irqsave(&ntp_lock, flags); switch (time_state) { case TIME_OK: break; case TIME_INS: - timekeeping_leap_insert(-1); + leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -372,7 +404,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - timekeeping_leap_insert(1); + leap = 1; time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE @@ -387,8 +419,14 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } + spin_unlock_irqrestore(&ntp_lock, flags); - write_sequnlock(&xtime_lock); + /* + * We have to call this outside of the ntp_lock to keep + * the proper locking hierarchy + */ + if (leap) + timekeeping_leap_insert(leap); return res; } @@ -404,6 +442,9 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) void second_overflow(void) { s64 delta; + unsigned long flags; + + spin_lock_irqsave(&ntp_lock, flags); /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -423,23 +464,25 @@ void second_overflow(void) pps_dec_valid(); if (!time_adjust) - return; + goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; - return; + goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= 
MAX_TICKADJ_SCALED; - return; + goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; +out: + spin_unlock_irqrestore(&ntp_lock, flags); } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -663,7 +706,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - write_seqlock_irq(&xtime_lock); + spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -705,7 +748,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - write_sequnlock_irq(&xtime_lock); + spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -903,7 +946,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - write_seqlock_irqsave(&xtime_lock, flags); + spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -916,7 +959,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -931,7 +974,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -948,7 +991,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 
fd4a7b1625a2..e883f57a3cd3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void) unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + if (cpumask_empty(tick_get_broadcast_mask())) + goto end; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); + +end: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7656642e4b8e..3526038f2836 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { - ktime_t now; - - now = ktime_get(); - - update_ts_time_stats(cpu, ts, now, NULL); + ktime_t now = ktime_get(); ts->idle_entrytime = now; ts->idle_active = 1; @@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void) local_irq_disable(); - if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (!ts->inidle || !ts->tick_stopped) { - ts->inidle = 0; + if (!ts->tick_stopped) { local_irq_enable(); return; } - ts->inidle = 0; - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c6358186401..403c2a092830 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,8 @@ struct timekeeper { /* Current clocksource used for timekeeping. */ struct clocksource *clock; + /* NTP adjusted clock multiplier */ + u32 mult; /* The shift value of the current clocksource. 
*/ int shift; @@ -45,12 +47,47 @@ struct timekeeper { /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ int ntp_error_shift; - /* NTP adjusted clock multiplier */ - u32 mult; + + /* The current time */ + struct timespec xtime; + /* + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + * + * wall_to_monotonic is moved after resume from suspend for the + * monotonic time not to jump. We need to add total_sleep_time to + * wall_to_monotonic to get the real boot based time offset. + * + * - wall_to_monotonic is no longer the boot time, getboottime must be + * used instead. + */ + struct timespec wall_to_monotonic; + /* time spent in suspend */ + struct timespec total_sleep_time; + /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ + struct timespec raw_time; + + /* Seqlock for all timekeeper values */ + seqlock_t lock; }; static struct timekeeper timekeeper; +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime. + */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + + + /** * timekeeper_setup_internals - Set up internals to use clocksource clock. * @@ -135,47 +172,28 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. 
Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ -static struct timespec xtime __attribute__ ((aligned (16))); -static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; - -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. - */ -static struct timespec raw_time; +/* must hold write on timekeeper.lock */ +static void timekeeping_update(bool clearntp) +{ + if (clearntp) { + timekeeper.ntp_error = 0; + ntp_clear(); + } + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); +} -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; -/* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { - xtime.tv_sec += leapsecond; - wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + unsigned long flags; + + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeper.xtime.tv_sec += leapsecond; + timekeeper.wall_to_monotonic.tv_sec -= leapsecond; + timekeeping_update(false); + write_sequnlock_irqrestore(&timekeeper.lock, flags); + } /** @@ -202,10 +220,10 @@ static void timekeeping_forward_now(void) /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); - timespec_add_ns(&xtime, nsec); + timespec_add_ns(&timekeeper.xtime, nsec); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&raw_time, nsec); + timespec_add_ns(&timekeeper.raw_time, nsec); } /** 
@@ -222,15 +240,15 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); - *ts = xtime; + *ts = timekeeper.xtime; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); } @@ -245,14 +263,16 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); - secs = xtime.tv_sec + wall_to_monotonic.tv_sec; - nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + seq = read_seqbegin(&timekeeper.lock); + secs = timekeeper.xtime.tv_sec + + timekeeper.wall_to_monotonic.tv_sec; + nsecs = timekeeper.xtime.tv_nsec + + timekeeper.wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -278,14 +298,14 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); - *ts = xtime; - tomono = wall_to_monotonic; + seq = read_seqbegin(&timekeeper.lock); + *ts = timekeeper.xtime; + tomono = timekeeper.wall_to_monotonic; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec + nsecs); @@ -313,10 +333,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { u32 arch_offset; - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); - *ts_raw = raw_time; - *ts_real = xtime; + *ts_raw = timekeeper.raw_time; + *ts_real = timekeeper.xtime; nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); @@ -326,7 +346,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw += arch_offset; nsecs_real += arch_offset; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -365,23 +385,19 @@ int do_settimeofday(const struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); - ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - - xtime = *tv; + ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, ts_delta); - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&xtime, 
&wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeper.xtime = *tv; + timekeeping_update(true); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -405,20 +421,17 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); - xtime = timespec_add(xtime, *ts); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); - - timekeeper.ntp_error = 0; - ntp_clear(); + timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, *ts); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(true); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -490,11 +503,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); nsecs = timekeeping_get_ns_raw(); - *ts = raw_time; + *ts = timekeeper.raw_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); } @@ -510,24 +523,30 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change! 
*/ u64 timekeeping_max_deferment(void) { - return timekeeper.clock->max_idle_ns; + unsigned long seq; + u64 ret; + do { + seq = read_seqbegin(&timekeeper.lock); + + ret = timekeeper.clock->max_idle_ns; + + } while (read_seqretry(&timekeeper.lock, seq)); + + return ret; } /** @@ -572,28 +591,29 @@ void __init timekeeping_init(void) read_persistent_clock(&now); read_boot_clock(&boot); - write_seqlock_irqsave(&xtime_lock, flags); + seqlock_init(&timekeeper.lock); ntp_init(); + write_seqlock_irqsave(&timekeeper.lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = now.tv_sec; - xtime.tv_nsec = now.tv_nsec; - raw_time.tv_sec = 0; - raw_time.tv_nsec = 0; + timekeeper.xtime.tv_sec = now.tv_sec; + timekeeper.xtime.tv_nsec = now.tv_nsec; + timekeeper.raw_time.tv_sec = 0; + timekeeper.raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = xtime.tv_sec; - boot.tv_nsec = xtime.tv_nsec; + boot.tv_sec = timekeeper.xtime.tv_sec; + boot.tv_nsec = timekeeper.xtime.tv_nsec; } - set_normalized_timespec(&wall_to_monotonic, + set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - total_sleep_time.tv_sec = 0; - total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + timekeeper.total_sleep_time.tv_sec = 0; + timekeeper.total_sleep_time.tv_nsec = 0; + write_sequnlock_irqrestore(&timekeeper.lock, flags); } /* time in seconds when suspend began */ @@ -614,9 +634,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - xtime = timespec_add(xtime, *delta); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); - total_sleep_time = timespec_add(total_sleep_time, *delta); + timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, *delta); + timekeeper.total_sleep_time = timespec_add( + 
timekeeper.total_sleep_time, *delta); } @@ -640,17 +662,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeping_forward_now(); __timekeeping_inject_sleeptime(delta); - timekeeper.ntp_error = 0; - ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(true); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -673,7 +693,7 @@ static void timekeeping_resume(void) clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -683,7 +703,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); @@ -701,7 +721,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); timekeeping_suspended = 1; @@ -711,7 +731,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. 
*/ - delta = timespec_sub(xtime, timekeeping_suspend_time); + delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -724,7 +744,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -775,7 +795,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); + tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -943,22 +963,22 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; + timekeeper.xtime.tv_sec++; second_overflow(); } /* Accumulate raw time */ raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += raw_time.tv_nsec; + raw_nsecs += timekeeper.raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - raw_time.tv_sec += raw_secs; + timekeeper.raw_time.tv_sec += raw_secs; } - raw_time.tv_nsec = raw_nsecs; + timekeeper.raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error += ntp_tick_length() << shift; timekeeper.ntp_error -= (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); @@ -970,17 +990,19 @@ static 
cycle_t logarithmic_accumulation(cycle_t offset, int shift) /** * update_wall_time - Uses the current clocksource to increment the wall time * - * Called from the timer interrupt, must hold a write on xtime_lock. */ static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; int shift = 0, maxshift; + unsigned long flags; + + write_seqlock_irqsave(&timekeeper.lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - return; + goto out; clock = timekeeper.clock; @@ -989,7 +1011,8 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << + timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1002,7 +1025,7 @@ static void update_wall_time(void) shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); shift = max(0, shift); /* Bound shift to one less then what overflows tick_length */ - maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); @@ -1040,8 +1063,10 @@ static void update_wall_time(void) * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. 
*/ - xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> + timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << + timekeeper.shift; timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; @@ -1049,15 +1074,17 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime.tv_nsec isn't larger then NSEC_PER_SEC */ - if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { - xtime.tv_nsec -= NSEC_PER_SEC; - xtime.tv_sec++; + if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; + timekeeper.xtime.tv_sec++; second_overflow(); } - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(false); + +out: + write_sequnlock_irqrestore(&timekeeper.lock, flags); + } /** @@ -1074,8 +1101,10 @@ static void update_wall_time(void) void getboottime(struct timespec *ts) { struct timespec boottime = { - .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, - .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + .tv_sec = timekeeper.wall_to_monotonic.tv_sec + + timekeeper.total_sleep_time.tv_sec, + .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + + timekeeper.total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); @@ -1101,13 +1130,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); - *ts = xtime; - tomono = wall_to_monotonic; - sleep = total_sleep_time; + seq = read_seqbegin(&timekeeper.lock); + *ts = timekeeper.xtime; + tomono = timekeeper.wall_to_monotonic; + sleep = timekeeper.total_sleep_time; nsecs = timekeeping_get_ns(); - } while (read_seqretry(&xtime_lock, seq)); + } while 
(read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); @@ -1137,19 +1166,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, total_sleep_time); + *ts = timespec_add(*ts, timekeeper.total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime.tv_sec; + return timekeeper.xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime; + return timekeeper.xtime; } struct timespec current_kernel_time(void) @@ -1158,10 +1187,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); - now = xtime; - } while (read_seqretry(&xtime_lock, seq)); + now = timekeeper.xtime; + } while (read_seqretry(&timekeeper.lock, seq)); return now; } @@ -1173,11 +1202,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); - now = xtime; - mono = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + now = timekeeper.xtime; + mono = timekeeper.wall_to_monotonic; + } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1209,11 +1238,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); - *xtim = xtime; - *wtom = wall_to_monotonic; - *sleep = total_sleep_time; - } while (read_seqretry(&xtime_lock, seq)); + seq = read_seqbegin(&timekeeper.lock); + *xtim = timekeeper.xtime; + *wtom = timekeeper.wall_to_monotonic; + *sleep = timekeeper.total_sleep_time; + } while (read_seqretry(&timekeeper.lock, seq)); } /** @@ -1225,9 +1254,10 @@ ktime_t ktime_get_monotonic_offset(void) 
struct timespec wtom; do { - seq = read_seqbegin(&xtime_lock); - wtom = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + seq = read_seqbegin(&timekeeper.lock); + wtom = timekeeper.wall_to_monotonic; + } while (read_seqretry(&timekeeper.lock, seq)); + return timespec_to_ktime(wtom); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1d..867bd1dd2dd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,11 +22,13 @@ #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> +#include <linux/bsearch.h> #include <linux/module.h> #include <linux/ftrace.h> #include <linux/sysctl.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/sort.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> @@ -60,6 +62,8 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -87,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { }; static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -166,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void 
control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void update_global_ops(void) { ftrace_func_t func; @@ -257,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ftrace_disabled) @@ -268,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + /* We don't support both control and global flags set. 
*/ + if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) + return -EINVAL; + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); + add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -300,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -EINVAL; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + ret = remove_ftrace_list_ops(&ftrace_global_list, + &global_ops, ops); if (!ret) ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + if (!ret) { + /* + * The ftrace_ops is now removed from the list, + * so there'll be no new users. We must ensure + * all current users are done before we free + * the control data. 
+ */ + synchronize_sched(); + control_ops_free(ops); + } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -947,13 +1016,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -984,18 +1046,19 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +static struct ftrace_page *ftrace_new_pgs; + +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1003,7 +1066,10 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1013,7 +1079,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1120,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { 
struct ftrace_hash *hash; @@ -1130,7 +1202,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return NULL; size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); if (!hash->buckets) { kfree(hash); @@ -1157,7 +1229,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1282,9 +1354,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1307,6 +1379,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. 
+ */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) + return 1; + } + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1336,7 +1449,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1346,7 +1459,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1363,8 +1476,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1372,7 +1485,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1406,40 +1519,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for 
freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } ftrace_pages = ftrace_pages->next; } @@ -1459,8 +1544,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1475,7 +1558,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. 
@failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,24 +1612,19 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ if (enable && (rec->flags & ~FTRACE_FL_MASK)) @@ -1542,18 +1632,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. 
+ */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. + */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1563,11 +1707,7 @@ static void ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1576,6 +1716,78 @@ static void ftrace_replace_code(int enable) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. 
+ * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,13 +1829,7 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. 
- */ - function_trace_stop++; - - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1636,21 +1842,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1877,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. 
+ */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); @@ -1691,7 +1932,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1984,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +2006,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -1788,14 +2028,16 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1817,46 +2059,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while 
(ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ - break; - } + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) { - ftrace_bug(failed, p->ip); - ftrace_free_rec(p); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. 
+ */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -1864,57 +2104,108 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. 
*/ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. 
+ */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } - return 0; + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + + return 0; +} #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1980,6 +2271,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2023,7 +2317,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2047,9 +2341,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2081,7 +2373,7 @@ static void 
reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2101,7 +2393,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -2126,12 +2419,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2189,6 +2478,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2217,6 +2507,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2237,7 +2528,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. 
With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2306,8 +2613,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int @@ -2317,7 +2625,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -2426,7 +2734,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -2871,14 +3178,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -2912,17 +3219,20 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && 
ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); + out_regex_unlock: mutex_unlock(&ftrace_regex_lock); free_ftrace_hash(hash); @@ -2939,10 +3249,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 1); + return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -2957,10 +3267,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 0); + return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** @@ -3045,8 +3355,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3059,17 +3369,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef 
CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; @@ -3107,7 +3416,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } @@ -3270,9 +3579,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3381,15 +3687,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. 
+ */ + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { + if (!ftrace_pages) + goto out; + + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; + } + + ftrace_pages->next = pg; + ftrace_pages = pg; + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3401,9 +3754,18 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -3417,32 +3779,55 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. 
*/ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } @@ -3562,6 +3947,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + op = rcu_dereference_raw(ftrace_control_list); + while (op != &ftrace_list_end) { + if (!ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + + op = rcu_dereference_raw(op->next); + }; + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, +}; + +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a00..10d5503f0d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2764,12 +2764,12 @@ static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "# mount -t debugfs nodev /sys/kernel/debug\n\n" "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" + "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" "# cat 
/sys/kernel/debug/tracing/current_tracer\n" "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" + "wakeup\n" "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..54faec790bc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,17 +56,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -288,6 +294,8 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) #define TRACE_GLOBAL_BIT (1<<12) +#define TRACE_CONTROL_BIT (1<<13) + /* * Abuse of the trace_recursion. 
* As we need a way to maintain state if we are tracing the function @@ -589,6 +597,8 @@ static inline int ftrace_trace_task(struct task_struct *task) static inline int ftrace_is_dead(void) { return 0; } #endif +int ftrace_event_is_function(struct ftrace_event_call *call); + /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found @@ -766,9 +776,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; @@ -818,12 +826,22 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..d91eb0541b3a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, 
TRACE_FN, @@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + FILTER_TRACE_FN, + + perf_ftrace_event_register ); /* Function call entry */ @@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -169,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -185,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], 
__entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -202,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + __entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -215,7 +231,9 @@ FTRACE_ENTRY(print, print_entry, ), F_printk("%08lx %s", - __entry->ip, __entry->buf) + __entry->ip, __entry->buf), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -234,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -252,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -272,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + /* The ftrace function trace is allowed only for root. 
*/ + if (ftrace_event_is_function(tp_event) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,6 +107,69 @@ fail: return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. 
+ */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; @@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { - hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) -{ struct ftrace_event_call 
*tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); + hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, @@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + + head = this_cpu_ptr(event_function.perf_events); + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + int ret 
= unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; @@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_stop_cmdline_record(); call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, 
TRACE_REG_UNREGISTER, NULL); } break; case 1: @@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_start_cmdline_record(); call->flags |= TRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd3..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name) static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); if (!stack->preds) return -ENOMEM; stack->index = n_preds; @@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) __free_preds(filter); - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); if (!filter->preds) return -ENOMEM; @@ -900,6 +901,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || 
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); @@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) children = count_leafs(preds, &preds[root->left]); children += count_leafs(preds, &preds[root->right]); - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); if (!root->ops) return -ENOMEM; @@ -1738,11 +1746,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + 
filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. 
+ */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1877,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; + err = create_filter(call, filter_string, true, &filter); - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } - - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. 
If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1910,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,66 +1933,178 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). 
+ */ + __free_filter(system->filter); + system->filter = filter; + } +out_unlock: + mutex_unlock(&event_mutex); - filter = __alloc_filter(); - if (!filter) - goto out; + return err; +} - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; +#ifdef CONFIG_PERF_EVENTS + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_filter(filter); +} + +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. */ - __free_filter(system->filter); - system->filter = filter; + while ((sep = strchr(str, ','))) + *sep = ' '; - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); -out_unlock: - mutex_unlock(&event_mutex); + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); - return err; + return ret; +} -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret; + int 
*reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; } -#ifdef CONFIG_PERF_EVENTS +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; -void ftrace_profile_free_filter(struct perf_event *event) + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) { - struct event_filter *filter = event->filter; + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } - event->filter = NULL; - __free_filter(filter); + return (*err) ? 
WALK_PRED_ABORT : WALK_PRED_DEFAULT; } +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +2119,17 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); + err = create_filter(call, filter_str, false, &filter); if (err) - goto free_ps; + goto free_filter; - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); + else event->filter = filter; -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - free_filter: - if (err) + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: @@ -1954,43 +2148,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - 
if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2266,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..7b46c9bd22ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -44,21 +54,22 @@ #define F_printk(fmt, args...) 
fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ @@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ 
sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ int \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -152,13 +164,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef F_printk #define F_printk(fmt, args...) #fmt ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ @@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 
00d527c945a4..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, #endif /* CONFIG_PERF_EVENTS */ static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) case TRACE_REG_PERF_UNREGISTER: disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..859fae6b1825 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + 
p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d97..d4545f49242e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> + +#include <asm/setup.h> + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -358,6 +380,12 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, 
d_tracer, + NULL, &stack_trace_filter_fops); + + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a1..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void) unsigned long addr; int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ 
-685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db110b8ae030..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/jump_label.h> +#include <linux/static_key.h> extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. 
*/ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } @@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod) int ret = 0; /* - * We skip modules that tain the kernel, especially those with different - * module header (for forced load), to make sure we don't cause a crash. + * We skip modules that taint the kernel, especially those with different + * module headers (for forced load), to make sure we don't cause a crash. + * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints) + if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f52..14bc092fb12c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -3,12 +3,9 @@ * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. 
- * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ @@ -117,9 +114,10 @@ static unsigned long get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns - * the divide by 5 is to give hrtimer 5 chances to - * increment before the hardlockup detector generates - * a warning + * the divide by 5 is to give hrtimer several chances (two + * or three with the current relation between the soft + * and hard thresholds) to increment before the + * hardlockup detector generates a warning */ return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } @@ -296,7 +294,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -336,9 +334,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in watchdog_timer_fn(). + * Run briefly (kicked by the hrtimer callback function) once every + * get_sample_period() seconds (4 seconds by default) to reset the + * softlockup timestamp. If this gets delayed for more than + * 2*watchdog_thresh seconds then the debug-printout triggers in + * watchdog_timer_fn(). 
*/ while (!kthread_should_stop()) { __touch_watchdog(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..5abf42f63c08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -474,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP + if (likely(cpu < nr_cpu_ids)) return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else - return wq->cpu_wq.single; -#endif - } } else if (likely(cpu == WORK_CPU_UNBOUND)) return wq->cpu_wq.single; return NULL; @@ -2897,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) wq->cpu_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; @@ -2927,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) static void free_cwqs(struct workqueue_struct *wq) { -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_wq.pcpu); else if (wq->cpu_wq.single) { /* the pointer to 
free is stored right after the cwq */ @@ -3833,8 +3819,11 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", + WQ_NON_REENTRANT | WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); return 0; } early_initcall(init_workqueues); |