Diffstat (limited to 'kernel')
130 files changed, 4933 insertions, 2638 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..6e699100872f 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,7 +1,5 @@ # # Generated files # -config_data.h -config_data.gz timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6aa7543bcdb2..6c57e78817da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n -$(obj)/configs.o: $(obj)/config_data.h +$(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - -filechk_ikconfiggz = \ - echo "static const char kernel_config_data[] __used = MAGIC_START"; \ - cat $< | scripts/bin2c; \ - echo "MAGIC_END;" - -targets += config_data.h -$(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call filechk,ikconfiggz) diff --git a/kernel/async.c b/kernel/async.c index a893d6170944..f6bd0d9885e1 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -149,7 +149,25 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) +/** + * async_schedule_node_domain - NUMA specific version of async_schedule_domain + * @func: function to execute asynchronously + * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to + * @domain: the domain + * + * Returns an async_cookie_t that may be used for checkpointing later. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. + * + * Note: This function may be called from atomic or non-atomic contexts. + * + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. + */ +async_cookie_t async_schedule_node_domain(async_func_t func, void *data, + int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -195,43 +213,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy current->flags |= PF_USED_ASYNC; /* schedule for execution */ - queue_work(system_unbound_wq, &entry->work); + queue_work_node(node, system_unbound_wq, &entry->work); return newcookie; } +EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** - * async_schedule - schedule a function for asynchronous execution + * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule(async_func_t func, void *data) -{ - return __async_schedule(func, data, &async_dfl_domain); -} -EXPORT_SYMBOL_GPL(async_schedule); - -/** - * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @func: function to execute asynchronously - * @data: data pointer to pass to the function - * @domain: the domain * - * Returns an async_cookie_t that may be used for checkpointing later. - * @domain may be used in the async_synchronize_*_domain() functions to - * wait within a certain synchronization domain rather than globally. 
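(Illustration, not part of the patch: with async_schedule_node()/async_schedule_node_domain() exported as above, a driver that wants its deferred work to run on or near a device's memory node might do roughly the following; my_async_probe(), my_driver_probe() and the dev_info() message are made-up names for this sketch.)

#include <linux/async.h>
#include <linux/device.h>

/* Runs from the async worker pool, preferably on the node passed below. */
static void my_async_probe(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	dev_info(dev, "deferred probe work running\n");
}

static int my_driver_probe(struct device *dev)
{
	/* Locality is best effort: if dev_to_node(dev) has no CPUs, the
	 * work is spread across all available CPUs, as noted above. */
	async_schedule_node(my_async_probe, dev, dev_to_node(dev));
	return 0;
}

Since the helper is built on queue_work_node(), the node argument is a placement hint rather than a strict requirement.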
A - * synchronization domain is specified via @domain. Note: This function - * may be called from atomic or non-atomic contexts. + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. */ -async_cookie_t async_schedule_domain(async_func_t func, void *data, - struct async_domain *domain) +async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { - return __async_schedule(func, data, domain); + return async_schedule_node_domain(func, data, node, &async_dfl_domain); } -EXPORT_SYMBOL_GPL(async_schedule_domain); +EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_synchronize_full - synchronize all asynchronous function calls diff --git a/kernel/audit.c b/kernel/audit.c index 632d36059556..c89ea48c70a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, struct audit_buffer *ab; int rc = 0; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); + audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_context *context, + struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); @@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return; } - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); @@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) audit_log_task_context(*ab); } +static inline void audit_log_user_recv_msg(struct audit_buffer **ab, + u16 msg_type) +{ + audit_log_common_recv_msg(NULL, ab, msg_type); +} + int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); @@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - audit_log_common_recv_msg(&ab, msg_type); + audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, @@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " op=%s audit_enabled=%d res=0", + msg_type == AUDIT_ADD_RULE ? 
+ "add_rule" : "remove_rule", + audit_enabled); audit_log_end(ab); return -EPERM; } @@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); @@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, @@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - if (cap_isclear(*cap)) { - audit_log_format(ab, " %s=0", prefix); - return; - } - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) - audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - audit_log_cap(ab, "cap_fp", &name->fcap.permitted); - audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); -} - -static inline int audit_copy_fcaps(struct audit_names *name, - const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> - VFS_CAP_REVISION_SHIFT; - - return 0; -} - -/* Copy inode data into an audit_names. 
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - -/** - * audit_log_name - produce AUDIT_PATH record from struct audit_names - * @context: audit_context for the task - * @n: audit_names structure with reportable details - * @path: optional path to report instead of audit_names->name - * @record_num: record number to report when handling a list of names - * @call_panic: optional pointer to int that will be updated if secid fails - */ -void audit_log_name(struct audit_context *context, struct audit_names *n, - const struct path *path, int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; - - audit_log_format(ab, "item=%d", record_num); - - if (path) - audit_log_d_path(ab, " name=", path); - else if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - /* log the audit_names record type */ - switch(n->type) { - case AUDIT_TYPE_NORMAL: - audit_log_format(ab, " nametype=NORMAL"); - break; - case AUDIT_TYPE_PARENT: - audit_log_format(ab, " nametype=PARENT"); - break; - case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, " nametype=DELETE"); - break; - case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, " nametype=CREATE"); - break; - default: - audit_log_format(ab, " nametype=UNKNOWN"); - break; - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); -} - int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; @@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation) audit_log_end(ab); } +/* global counter which is incremented every time something logs in */ +static atomic_t session_id = ATOMIC_INIT(0); + +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if 
(is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) + && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, + unsigned int sessionid, int rc) +{ + struct audit_buffer *ab; + uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + + uid = from_kuid(&init_user_ns, task_uid(current)); + oldloginuid = from_kuid(&init_user_ns, koldloginuid); + loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(); + + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); + audit_log_end(ab); +} + +/** + * audit_set_loginuid - set current task's loginuid + * @loginuid: loginuid value + * + * Returns 0. + * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ +int audit_set_loginuid(kuid_t loginuid) +{ + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; + kuid_t oldloginuid; + int rc; + + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); + + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) { + sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == AUDIT_SID_UNSET)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } + + current->sessionid = sessionid; + current->loginuid = loginuid; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit.h b/kernel/audit.h index 91421679a168..958d5b8fc1b3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -69,6 +69,7 @@ struct audit_cap_data { kernel_cap_t effective; /* effective set of process */ }; kernel_cap_t ambient; + kuid_t rootid; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump @@ -212,15 +213,6 @@ extern bool audit_ever_enabled; extern void audit_log_session_info(struct audit_buffer *ab); -extern void audit_copy_inode(struct audit_names *name, - const struct dentry *dentry, - struct inode *inode); -extern void audit_log_cap(struct audit_buffer *ab, char *prefix, - kernel_cap_t *cap); -extern void audit_log_name(struct audit_context *context, - struct audit_names *n, const struct path *path, - int record_num, int *call_panic); - extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 @@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab, extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); -/* audit watch functions */ +/* audit watch/mark/tree functions */ #ifdef CONFIG_AUDITSYSCALL +extern unsigned int audit_serial(void); +extern int auditsc_get_stamp(struct audit_context *ctx, + struct timespec64 *t, unsigned int *serial); + extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); -extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op); extern int audit_add_watch(struct audit_krule 
*krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, + dev_t dev); -extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, + char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, + unsigned long ino, dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); -extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); +extern int audit_exe_compare(struct task_struct *tsk, + struct audit_fsnotify_mark *mark); + +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, + struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); +extern void audit_trim_trees(void); +extern int audit_tag_tree(char *old, char *new); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct audit_context *context); -#else +extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx); +extern struct list_head *audit_killed_trees(void); +#else /* CONFIG_AUDITSYSCALL */ +#define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} #define audit_get_watch(w) {} #define audit_to_watch(k, p, l, o) (-EINVAL) @@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDITSYSCALL -extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); -extern void audit_put_chunk(struct audit_chunk *chunk); -extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); -extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); -extern int audit_add_tree_rule(struct audit_krule *rule); -extern int audit_remove_tree_rule(struct audit_krule *rule); -extern void audit_trim_trees(void); -extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *tree); -extern void audit_put_tree(struct audit_tree *tree); -extern void audit_kill_trees(struct list_head *list); -#else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL #define audit_make_tree(rule, str, op) -EINVAL @@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* 
never called */ -#define audit_kill_trees(list) BUG() -#endif +#define audit_kill_trees(context) BUG() + +#define audit_signal_info(s, t) AUDIT_DISABLED +#define audit_filter_inodes(t, c) AUDIT_DISABLED +#endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); @@ -334,14 +342,5 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); -#ifdef CONFIG_AUDITSYSCALL -extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); -extern struct list_head *audit_killed_trees(void); -#else -#define audit_signal_info(s,t) AUDIT_DISABLED -#define audit_filter_inodes(t,c) AUDIT_DISABLED -#endif - extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index cf4512a33675..37ae95cfb7f4 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_session_info(ab); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d4af4d97f847..abfb112f26aa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_tree_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_context *context, + struct audit_krule *rule) { struct audit_buffer *ab; if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=remove_rule dir="); @@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) audit_log_end(ab); } -static void kill_rules(struct audit_tree *tree) +static void kill_rules(struct audit_context *context, struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; @@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_tree_log_remove_rule(rule); + audit_tree_log_remove_rule(context, rule); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); rule->tree = NULL; @@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree) tree->goner = 1; spin_unlock(&hash_lock); mutex_lock(&audit_filter_mutex); - kill_rules(tree); + kill_rules(audit_context(), tree); list_del_init(&tree->list); mutex_unlock(&audit_filter_mutex); prune_one(tree); @@ -973,8 +974,10 @@ static void audit_schedule_prune(void) * ... and that one is done if evict_chunk() decides to delay until the end * of syscall. Runs synchronously. 
*/ -void audit_kill_trees(struct list_head *list) +void audit_kill_trees(struct audit_context *context) { + struct list_head *list = &context->killed_trees; + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); @@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list) struct audit_tree *victim; victim = list_entry(list->next, struct audit_tree, list); - kill_rules(victim); + kill_rules(context, victim); list_del_init(&victim->list); mutex_unlock(&audit_filter_mutex); @@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk) list_del_init(&owner->same_root); spin_unlock(&hash_lock); if (!postponed) { - kill_rules(owner); + kill_rules(audit_context(), owner); list_move(&owner->list, &prune_list); need_prune = 1; } else { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20ef9ba134b0..e8d1adeb2223 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bf309f2592c4..63f8b3f26fab 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fallthrough if set */ + /* fall through - if set */ default: data->values[i] = f->val; } @@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); @@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype) if (f->lsm_rule) { security_task_getsecid(current, &sid); result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule, NULL); + f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6593a5207fb0..d1eab1d4a930 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk, need_sid = 0; } result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); + f->op, + f->lsm_rule); } break; case AUDIT_OBJ_USER: @@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); + name->osid, + f->type, + f->op, + f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + if (security_audit_rule_match( + n->osid, + f->type, + f->op, + f->lsm_rule)) { ++result; break; } @@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; if (security_audit_rule_match(ctx->ipc.osid, f->type, f->op, - f->lsm_rule, ctx)) + f->lsm_rule)) ++result; } break; @@ -1136,6 +1139,32 @@ out: kfree(buf_head); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; + } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, 
"%08x", cap->cap[CAP_LAST_U32 - i]); +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + if (name->fcap_ver == -1) { + audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?"); + return; + } + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", + name->fcap.fE, name->fcap_ver, + from_kuid(&init_user_ns, name->fcap.rootid)); +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } +/* + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +static void audit_log_name(struct audit_context *context, struct audit_names *n, + const struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd + */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != AUDIT_INO_UNSET) + audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + /* log the audit_names record type */ + switch (n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, " nametype=NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, " nametype=PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, " nametype=DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, " nametype=CREATE"); + break; + default: + audit_log_format(ab, " nametype=UNKNOWN"); + break; + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + static void audit_log_proctitle(void) { int res; @@ -1358,6 +1478,9 @@ static void audit_log_exit(void) audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); + audit_log_format(ab, " frootid=%d", + from_kuid(&init_user_ns, + axs->fcap.rootid)); break; } } @@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk) if (!context) return; + if 
(!list_empty(&context->killed_trees)) + audit_kill_trees(context); + /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't doesn't have any meaningful data we @@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk) audit_log_exit(); } - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code) if (!context) return; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(context); + if (!context->dummy && context->in_syscall) { if (success) context->return_valid = AUDITSC_SUCCESS; @@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); @@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name) get_fs_pwd(current->fs, &context->pwd); } +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap.rootid = caps.rootid; + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. 
*/ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + struct inode *inode, unsigned int flags) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + if (flags & AUDIT_INODE_NOEVAL) { + name->fcap_ver = -1; + return; + } + audit_copy_fcaps(name, dentry); +} + /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + rcu_read_unlock(); + if (!name) goto out_alloc; @@ -1832,7 +2017,7 @@ out: n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) @@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent, for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_FSTYPE) { - if (audit_comparator(parent->i_sb->s_magic, - f->op, f->val)) { - if (e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } - } + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent, n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; - audit_copy_inode(n, NULL, parent); + audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { @@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent, } if (inode) - audit_copy_inode(found_child, dentry, inode); + audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } @@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx, return 1; } -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -static int audit_set_loginuid_perm(kuid_t loginuid) -{ - /* if we are unset, we don't need privs */ - if (!audit_loginuid_set(current)) - return 0; - /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ - if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) - return -EPERM; - /* it is set, you need permission */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - /* reject if this is not an unset and we don't allow that */ - if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) - return -EPERM; - return 0; -} - -static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, - unsigned int oldsessionid, unsigned int sessionid, - int rc) -{ - struct audit_buffer *ab; - uid_t uid, oldloginuid, loginuid; - struct tty_struct *tty; - - if (!audit_enabled) - return; - - ab = 
audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; - - uid = from_kuid(&init_user_ns, task_uid(current)); - oldloginuid = from_kuid(&init_user_ns, koldloginuid); - loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(); - - audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); - audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", - oldsessionid, sessionid, !rc); - audit_put_tty(tty); - audit_log_end(ab); -} - -/** - * audit_set_loginuid - set current task's audit_context loginuid - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). - */ -int audit_set_loginuid(kuid_t loginuid) -{ - unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; - kuid_t oldloginuid; - int rc; - - oldloginuid = audit_get_loginuid(current); - oldsessionid = audit_get_sessionid(current); - - rc = audit_set_loginuid_perm(loginuid); - if (rc) - goto out; - - /* are we setting or clearing? */ - if (uid_valid(loginuid)) { - sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == AUDIT_SID_UNSET)) - sessionid = (unsigned int)atomic_inc_return(&session_id); - } - - current->sessionid = sessionid; - current->loginuid = loginuid; -out: - audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); - return rc; -} - /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag @@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f08c257858e..ff09d32a8a1b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -539,7 +539,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bc34cf9fe9ee..62f6bced3a3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1258,6 +1258,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1631,6 +1632,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..1444f3954d75 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ + /* fall through - v3 is otherwise equivalent to v2. 
*/ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; @@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); @@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); @@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +static bool ns_capable_common(struct user_namespace *ns, + int cap, + unsigned int opts) { int capable; @@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) BUG(); } - capable = audit ? security_capable(current_cred(), ns, cap) : - security_capable_noaudit(current_cred(), ns, cap); + capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; @@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) */ bool ns_capable(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, true); + return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); @@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable); */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, false); + return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** + * ns_capable_setid - Determine if the current task has a superior capability + * in effect, while signalling that this check is being done from within a + * setid syscall. + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
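(Illustration, not part of the patch: a setid-style permission check could use the new ns_capable_setid() so that an LSM can distinguish it from other CAP_SETUID checks via CAP_OPT_INSETID; may_change_ruid() and its logic are hypothetical.)

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>

/* Hypothetical helper: allow changing to @uid if the caller already has
 * that uid, or holds CAP_SETUID (checked as an in-setid operation). */
static bool may_change_ruid(struct user_namespace *ns, kuid_t uid)
{
	if (uid_eq(uid, current_uid()))
		return true;
	return ns_capable_setid(ns, CAP_SETUID);
}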
+ */ +bool ns_capable_setid(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, CAP_OPT_INSETID); +} +EXPORT_SYMBOL(ns_capable_setid); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable); bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { + if (WARN_ON_ONCE(!cap_valid(cap))) return false; - if (security_capable(file->f_cred, ns, cap) == 0) + if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; @@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; + rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) - ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, + CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cef98502b124..eef24a25bda7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -5314,7 +5326,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5750,16 +5762,19 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); +} +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..72afd55f70c6 100644 --- 
a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -203,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..c9960baaa14f 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..1d75ae7f1cb7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. */ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d503d1a9007c..bb95a35e8c2d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; if (pos == root) return NULL; @@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); - if (parent && rstatc->updated_next) { + if (rstatc->updated_next) { + struct cgroup *parent = cgroup_parent(pos); struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; @@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * updated stat. 
*/ smp_mb(); + + return pos; } - return pos; + /* only happens for @root */ + return NULL; } /* see cgroup_rstat_flush() */ diff --git a/kernel/compat.c b/kernel/compat.c index f01affa17e22..d8a36c6ad7c9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -20,7 +20,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> -#include <linux/timex.h> #include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> @@ -30,69 +29,6 @@ #include <linux/uaccess.h> -int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) -{ - struct compat_timex tx32; - - memset(txc, 0, sizeof(struct timex)); - if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) - return -EFAULT; - - txc->modes = tx32.modes; - txc->offset = tx32.offset; - txc->freq = tx32.freq; - txc->maxerror = tx32.maxerror; - txc->esterror = tx32.esterror; - txc->status = tx32.status; - txc->constant = tx32.constant; - txc->precision = tx32.precision; - txc->tolerance = tx32.tolerance; - txc->time.tv_sec = tx32.time.tv_sec; - txc->time.tv_usec = tx32.time.tv_usec; - txc->tick = tx32.tick; - txc->ppsfreq = tx32.ppsfreq; - txc->jitter = tx32.jitter; - txc->shift = tx32.shift; - txc->stabil = tx32.stabil; - txc->jitcnt = tx32.jitcnt; - txc->calcnt = tx32.calcnt; - txc->errcnt = tx32.errcnt; - txc->stbcnt = tx32.stbcnt; - - return 0; -} - -int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) -{ - struct compat_timex tx32; - - memset(&tx32, 0, sizeof(struct compat_timex)); - tx32.modes = txc->modes; - tx32.offset = txc->offset; - tx32.freq = txc->freq; - tx32.maxerror = txc->maxerror; - tx32.esterror = txc->esterror; - tx32.status = txc->status; - tx32.constant = txc->constant; - tx32.precision = txc->precision; - tx32.tolerance = txc->tolerance; - tx32.time.tv_sec = txc->time.tv_sec; - tx32.time.tv_usec = txc->time.tv_usec; - tx32.tick = txc->tick; - tx32.ppsfreq = txc->ppsfreq; - tx32.jitter = txc->jitter; - tx32.shift = txc->shift; - tx32.stabil = txc->stabil; - tx32.jitcnt = txc->jitcnt; - tx32.calcnt = txc->calcnt; - tx32.errcnt = txc->errcnt; - tx32.stbcnt = txc->stbcnt; - tx32.tai = txc->tai; - if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) - return -EFAULT; - return 0; -} - static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(ctv, sizeof(*ctv)) || diff --git a/kernel/configs.c b/kernel/configs.c index 2df132b20217..b062425ccf8d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -30,37 +30,35 @@ #include <linux/init.h> #include <linux/uaccess.h> -/**************************************************/ -/* the actual current config file */ - /* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * <image> - * IKCFG_ED + * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from + * a binary kernel image or a module. See scripts/extract-ikconfig. 
*/ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) +asm ( +" .pushsection .rodata, \"a\" \n" +" .ascii \"IKCFG_ST\" \n" +" .global kernel_config_data \n" +"kernel_config_data: \n" +" .incbin \"kernel/config_data.gz\" \n" +" .global kernel_config_data_end \n" +"kernel_config_data_end: \n" +" .ascii \"IKCFG_ED\" \n" +" .popsection \n" +); #ifdef CONFIG_IKCONFIG_PROC +extern char kernel_config_data; +extern char kernel_config_data_end; + static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); + &kernel_config_data, + &kernel_config_data_end - + &kernel_config_data); } static const struct file_operations ikconfig_file_ops = { @@ -79,7 +77,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - proc_set_size(entry, kernel_config_data_size); + proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..025f419d16f6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -313,6 +313,15 @@ void cpus_write_unlock(void) void lockdep_assert_cpus_held(void) { + /* + * We can't have hotplug operations before userspace starts running, + * and some init codepaths will knowingly not take the hotplug lock. + * This is all valid, so mute lockdep until it makes sense to report + * unheld locks. + */ + if (system_state < SYSTEM_RUNNING) + return; + percpu_rwsem_assert_held(&cpu_hotplug_lock); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/cred.c b/kernel/cred.c index 21f4a97085b4..45d77284aed0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; -#ifdef CONFIG_SECURITY_SELINUX - /* - * cred->security == NULL if security_cred_alloc_blank() or - * security_prepare_creds() returned an error. 
- */ - if (selinux_is_enabled() && cred->security) { - if ((unsigned long) cred->security < PAGE_SIZE) - return true; - if ((*(u32 *)cred->security & 0xffffff00) == - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) - return true; - } -#endif return false; } EXPORT_SYMBOL(creds_are_invalid); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index ca88b867e7fe..a06ba3013b3b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -16,7 +16,16 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool -config HAVE_GENERIC_DMA_COHERENT +config ARCH_HAS_DMA_SET_MASK + bool + +config DMA_DECLARE_COHERENT + bool + +config ARCH_HAS_SETUP_DMA_OPS + bool + +config ARCH_HAS_TEARDOWN_DMA_OPS bool config ARCH_HAS_SYNC_DMA_FOR_DEVICE @@ -53,3 +62,116 @@ config DMA_REMAP config DMA_DIRECT_REMAP bool select DMA_REMAP + +config DMA_CMA + bool "DMA Contiguous Memory Allocator" + depends on HAVE_DMA_CONTIGUOUS && CMA + help + This enables the Contiguous Memory Allocator which allows drivers + to allocate big physically-contiguous blocks of memory for use with + hardware components that do not support I/O map nor scatter-gather. + + You can disable CMA by specifying "cma=0" on the kernel's command + line. + + For more information see <include/linux/dma-contiguous.h>. + If unsure, say "n". + +if DMA_CMA +comment "Default contiguous memory area size:" + +config CMA_SIZE_MBYTES + int "Size in Mega Bytes" + depends on !CMA_SIZE_SEL_PERCENTAGE + default 0 if X86 + default 16 + help + Defines the size (in MiB) of the default memory area for Contiguous + Memory Allocator. If the size of 0 is selected, CMA is disabled by + default, but it can be enabled by passing cma=size[MG] to the kernel. + + +config CMA_SIZE_PERCENTAGE + int "Percentage of total memory" + depends on !CMA_SIZE_SEL_MBYTES + default 0 if X86 + default 10 + help + Defines the size of the default memory area for Contiguous Memory + Allocator as a percentage of the total memory in the system. + If 0 percent is selected, CMA is disabled by default, but it can be + enabled by passing cma=size[MG] to the kernel. + +choice + prompt "Selected region size" + default CMA_SIZE_SEL_MBYTES + +config CMA_SIZE_SEL_MBYTES + bool "Use mega bytes value only" + +config CMA_SIZE_SEL_PERCENTAGE + bool "Use percentage value only" + +config CMA_SIZE_SEL_MIN + bool "Use lower value (minimum)" + +config CMA_SIZE_SEL_MAX + bool "Use higher value (maximum)" + +endchoice + +config CMA_ALIGNMENT + int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 4 12 + default 8 + help + DMA mapping framework by default aligns all buffers to the smallest + PAGE_SIZE order which is greater than or equal to the requested buffer + size. This works well for buffers up to a few hundreds kilobytes, but + for larger buffers it just a memory waste. With this parameter you can + specify the maximum PAGE_SIZE order for contiguous buffers. Larger + buffers will be aligned only to this specified order. The order is + expressed as a power of two multiplied by the PAGE_SIZE. + + For example, if your system defaults to 4KiB pages, the order value + of 8 means that the buffers will be aligned up to 1MiB only. + + If unsure, leave the default value "8". + +endif + +config DMA_API_DEBUG + bool "Enable debugging of DMA-API usage" + select NEED_DMA_MAP_STATE + help + Enable this option to debug the use of the DMA API by device drivers. 
+ With this option you will be able to detect common bugs in device + drivers like double-freeing of DMA mappings or freeing mappings that + were never allocated. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. + +config DMA_API_DEBUG_SG + bool "Debug DMA scatter-gather usage" + default y + depends on DMA_API_DEBUG + help + Perform extra checking that callers of dma_map_sg() have respected the + appropriate segment length/boundary limits for the given device when + preparing DMA scatterlists. + + This is particularly likely to have been overlooked in cases where the + dma_map_sg() API is used for general bulk mapping of pages rather than + preparing literal scatter-gather descriptors, where there is a risk of + unexpected behaviour from DMA API implementations if the scatterlist + is technically out-of-spec. + + If unsure, say N. diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 72ff6e46aa86..d237cf3dc181 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 66f0fb7e9a3a..29fd6590dc1e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -14,7 +14,6 @@ struct dma_coherent_mem { dma_addr_t device_base; unsigned long pfn_base; int size; - int flags; unsigned long *bitmap; spinlock_t spinlock; bool use_dev_dma_pfn_offset; @@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) +static int dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, + struct dma_coherent_mem **mem) { struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; + void *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); int ret; @@ -73,7 +72,6 @@ static int dma_init_coherent_memory( dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; - dma_mem->flags = flags; spin_lock_init(&dma_mem->spinlock); *mem = dma_mem; @@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev, } int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) + dma_addr_t device_addr, size_t size) { struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem); if (ret) return ret; @@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev) } EXPORT_SYMBOL(dma_release_declared_memory); -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, 
err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { @@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 0; *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; + return 1; } void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) @@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) if (!mem) { ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); + rmem->size, &mem); if (ret) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 23cf5361bcf1..45d51e8e26f6 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -134,17 +134,6 @@ static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *nr_total_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - /* per-driver filter related state */ #define NAME_MAX_LEN 64 @@ -840,66 +829,46 @@ static const struct file_operations filter_fops = { .llseek = default_llseek, }; -static int dma_debug_fs_init(void) +static int dump_show(struct seq_file *seq, void *v) { - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if (!dma_debug_dent) { - pr_err("can not create debugfs directory\n"); - return -ENOMEM; - } + int idx; - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); - if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - 
&min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444, - dma_debug_dent, - &nr_total_entries); - if (!nr_total_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, &filter_fops); - if (!filter_dent) - goto out_err; + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + seq_printf(seq, + "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n", + dev_name(entry->dev), + dev_driver_string(entry->dev), + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + spin_unlock_irqrestore(&bucket->lock, flags); + } return 0; +} +DEFINE_SHOW_ATTRIBUTE(dump); -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; +static void dma_debug_fs_init(void) +{ + struct dentry *dentry = debugfs_create_dir("dma-api", NULL); + + debugfs_create_bool("disabled", 0444, dentry, &global_disable); + debugfs_create_u32("error_count", 0444, dentry, &error_count); + debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors); + debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors); + debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries); + debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries); + debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); + debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); + debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); } static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) @@ -985,12 +954,7 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - if (dma_debug_fs_init() != 0) { - pr_err("error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } + dma_debug_fs_init(); nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 355d16acee6d..fcdb23e8d2fc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -132,8 +132,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -356,6 +355,20 @@ out_unmap: } EXPORT_SYMBOL(dma_direct_map_sg); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t dma_addr = paddr; + + if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_direct_map_resource); + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by @@ -380,3 +393,14 @@ int dma_direct_supported(struct device *dev, u64 mask) */ return mask >= __phys_to_dma(dev, min_mask); } + +size_t dma_direct_max_mapping_size(struct device *dev) +{ + size_t size = SIZE_MAX; + + /* If SWIOTLB is active, use its maximum mapping size */ + if (is_swiotlb_active()) + size = 
swiotlb_max_mapping_size(dev); + + return size; +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a11006b6d8e8..c000906348c9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) @@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) @@ -357,3 +360,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, ops->cache_sync(dev, vaddr, size, dir); } EXPORT_SYMBOL(dma_cache_sync); + +size_t dma_max_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (dma_is_direct(ops)) + size = dma_direct_max_mapping_size(dev); + else if (ops && ops->max_mapping_size) + size = ops->max_mapping_size(dev); + + return size; +} +EXPORT_SYMBOL_GPL(dma_max_mapping_size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1fb6fd68b9c7..9f5851064aea 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -34,6 +34,9 @@ #include <linux/scatterlist.h> #include <linux/mem_encrypt.h> #include <linux/set_memory.h> +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#endif #include <asm/io.h> #include <asm/dma.h> @@ -73,6 +76,11 @@ phys_addr_t io_tlb_start, io_tlb_end; static unsigned long io_tlb_nslabs; /* + * The number of used IO TLB block + */ +static unsigned long io_tlb_used; + +/* * This is a free list describing the number of free entries available from * each index */ @@ -385,7 +393,7 @@ void __init swiotlb_exit(void) } /* - * Bounce: copy the swiotlb buffer back to the original dma location + * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) @@ -475,6 +483,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * request and allocate a buffer from that IO TLB pool. 
*/ spin_lock_irqsave(&io_tlb_lock, flags); + + if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + goto not_found; + index = ALIGN(io_tlb_index, stride); if (index >= io_tlb_nslabs) index = 0; @@ -524,6 +536,7 @@ not_found: dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); return DMA_MAPPING_ERROR; found: + io_tlb_used += nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); /* @@ -584,6 +597,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, */ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; + + io_tlb_used -= nslots; } spin_unlock_irqrestore(&io_tlb_lock, flags); } @@ -651,14 +666,49 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) +size_t swiotlb_max_mapping_size(struct device *dev) { - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; } + +bool is_swiotlb_active(void) +{ + /* + * When SWIOTLB is initialized, even if io_tlb_start points to physical + * address zero, io_tlb_end surely doesn't. + */ + return io_tlb_end != 0; +} + +#ifdef CONFIG_DEBUG_FS + +static int __init swiotlb_create_debugfs(void) +{ + struct dentry *d_swiotlb_usage; + struct dentry *ent; + + d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); + + if (!d_swiotlb_usage) + return -ENOMEM; + + ent = debugfs_create_ulong("io_tlb_nslabs", 0400, + d_swiotlb_usage, &io_tlb_nslabs); + if (!ent) + goto fail; + + ent = debugfs_create_ulong("io_tlb_used", 0400, + d_swiotlb_usage, &io_tlb_used); + if (!ent) + goto fail; + + return 0; + +fail: + debugfs_remove_recursive(d_swiotlb_usage); + return -ENOMEM; +} + +late_initcall(swiotlb_create_debugfs); + +#endif diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..c2b41a263166 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index 26d6edab051a..1032a16bd186 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) static void get_ctx(struct perf_event_context *ctx) { - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head) static void put_ctx(struct perf_event_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) @@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock @@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } @@ -1400,7 +1402,7 @@ retry: } if (ctx->task == TASK_TOMBSTONE || - !atomic_inc_not_zero(&ctx->refcount)) { + !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { @@ -2797,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. 
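The perf_event_context and ring_buffer hunks in this file convert open-coded atomic_t reference counts to refcount_t. A minimal sketch of the RCU-lookup-plus-refcount idiom these call sites rely on, written against a hypothetical "ctx" object rather than the kernel's own structures, could look like this:

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>

/*
 * Illustrative only: a hypothetical "ctx" object, not the kernel's
 * perf_event_context.  A reader may take a reference only while the
 * count is still non-zero; the final put frees the object after an
 * RCU grace period, so a concurrent reader never touches freed memory.
 */
struct ctx {
	refcount_t	refcount;
	struct rcu_head	rcu_head;
};

static struct ctx *ctx_get(struct ctx __rcu **slot)
{
	struct ctx *c;

	rcu_read_lock();
	c = rcu_dereference(*slot);
	if (c && !refcount_inc_not_zero(&c->refcount))
		c = NULL;	/* lost the race against the final put */
	rcu_read_unlock();

	return c;
}

static void ctx_put(struct ctx *c)
{
	if (c && refcount_dec_and_test(&c->refcount))
		kfree_rcu(c, rcu_head);
}

The conversion itself is mechanical (atomic_set, atomic_inc_not_zero and atomic_dec_and_test become refcount_set, refcount_inc_not_zero and refcount_dec_and_test); the gain is that refcount_t saturates and warns on overflow or underflow instead of silently wrapping.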
@@ -4056,7 +4058,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->event_list); INIT_LIST_HEAD(&ctx->pinned_active); INIT_LIST_HEAD(&ctx->flexible_active); - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); } static struct perf_event_context * @@ -4235,8 +4237,9 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + attr->task || attr->ksymbol || + attr->context_switch || + attr->bpf_event) return true; return false; } @@ -4305,6 +4308,10 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -4440,7 +4447,7 @@ static void _free_event(struct perf_event *event) perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); @@ -5396,7 +5403,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) + if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); @@ -5406,7 +5413,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&rb->refcount)) + if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); @@ -5467,11 +5474,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } @@ -5540,7 +5547,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5688,7 +5695,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5729,7 +5736,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { @@ -6497,7 +6504,7 @@ void perf_prepare_sample(struct perf_event_header *header, data->phys_addr = perf_virt_to_phys(data->addr); } -static __always_inline void +static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, @@ -6507,13 +6514,15 @@ __perf_event_output(struct perf_event *event, { struct perf_output_handle handle; struct perf_event_header header; + int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (output_begin(&handle, event, header.size)) + err = output_begin(&handle, event, 
header.size); + if (err) goto exit; perf_output_sample(&handle, &header, data, event); @@ -6522,6 +6531,7 @@ __perf_event_output(struct perf_event *event, exit: rcu_read_unlock(); + return err; } void @@ -6540,12 +6550,12 @@ perf_event_output_backward(struct perf_event *event, __perf_event_output(event, data, regs, perf_output_begin_backward); } -void +int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - __perf_event_output(event, data, regs, perf_output_begin); + return __perf_event_output(event, data, regs, perf_output_begin); } /* @@ -6686,7 +6696,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -7366,28 +7377,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -7658,6 +7688,207 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + 
perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + 
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -8776,26 +9007,19 @@ static void perf_addr_filters_splice(struct perf_event *event, * @filter; if so, adjust filter's address range. * Called with mm::mmap_sem down for reading. */ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) - continue; - - if (!perf_addr_filter_match(filter, file, off, vma_size)) + if (!vma->vm_file) continue; - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -8829,15 +9053,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; /* * Adjust base offset if the filter is associated to a binary * that needs to be mapped: */ if (filter->path.dentry) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); count++; } @@ -8951,6 +9175,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; + /* fall through */ case IF_SRC_FILEADDR: case IF_SRC_FILE: @@ -9788,6 +10013,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); + if (!ret) { + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + if (event->destroy) + event->destroy(event); + ret = -EINVAL; + } + } + if (ret) module_put(pmu->module); @@ -9916,6 +10150,10 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* @@ -10098,14 +10336,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { + event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } + /* + * Clone the parent's vma offsets: they are valid until exec() + * even if the mm is not shared with the parent. 
+ */ + if (event->parent) { + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + raw_spin_lock_irq(&ifh->lock); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); + raw_spin_unlock_irq(&ifh->lock); + } + /* force hw sync on the address filters */ event->addr_filters_gen = 1; } @@ -10124,7 +10376,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; err_addr_filters: - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); @@ -10407,7 +10659,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader, again: rcu_read_lock(); gctx = READ_ONCE(group_leader->ctx); - if (!atomic_inc_not_zero(&gctx->refcount)) { + if (!refcount_inc_not_zero(&gctx->refcount)) { rcu_read_unlock(); goto again; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5befb338a18d..c5cd852fe86b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -1,18 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6dc725a7e7bc..79c47076700a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -4,13 +4,14 @@ #include <linux/hardirq.h> #include <linux/uaccess.h> +#include <linux/refcount.h> /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { - atomic_t refcount; + refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; @@ -48,7 +49,7 @@ struct ring_buffer { atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); - atomic_t aux_refcount; + refcount_t aux_refcount; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5ab4fe3b1dcc..a4047321d7d8 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> @@ -285,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) else rb->overwrite = 1; - atomic_set(&rb->refcount, 1); + refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); @@ -358,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!atomic_read(&rb->aux_mmap_count)) goto err; - if (!atomic_inc_not_zero(&rb->aux_refcount)) + if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; /* @@ -599,29 +598,27 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order = 0; + int ret = -ENOMEM, max_order; if (!has_aux(event)) return -EOPNOTSUPP; - if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { - /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. - */ - max_order = ilog2(nr_pages); + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); - /* - * PMU requests more than one contiguous chunks of memory - * for SW double buffering - */ - if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && - !overwrite) { - if (!max_order) - return -EINVAL; + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; - max_order--; - } + max_order--; } rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, @@ -658,7 +655,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; @@ -671,7 +668,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, * we keep a refcount here to make sure either of the two can * reference them safely. */ - atomic_set(&rb->aux_refcount, 1); + refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; @@ -690,7 +687,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { - if (atomic_dec_and_test(&rb->aux_refcount)) + if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8aef47ee7bfa..affa830a198c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju diff --git a/kernel/exit.c b/kernel/exit.c index 2639a30a8aa5..2166c2d92ddc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -219,6 +219,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..9dcd18aa210b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,7 +77,6 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -429,7 +428,7 @@ static void release_task_stack(struct task_struct *tsk) #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { - if (atomic_dec_and_test(&tsk->stack_refcount)) + if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif @@ -447,7 +446,7 @@ void free_task(struct task_struct *tsk) * If the task had a separate stack allocation, it should be gone * by now. */ - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -710,14 +709,14 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); cgroup_free(tsk); @@ -867,7 +866,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK - atomic_set(&tsk->stack_refcount, 1); + refcount_set(&tsk->stack_refcount, 1); #endif if (err) @@ -896,7 +895,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * One for us, one for whoever does the "release_task()" (usually * parent) */ - atomic_set(&tsk->usage, 2); + refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -981,7 +980,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); @@ -1463,7 +1462,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { - atomic_inc(¤t->sighand->count); + refcount_inc(¤t->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1470,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + refcount_set(&sig->count, 1); spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); @@ -1480,7 +1479,7 @@ static int 
copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) { + if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -1527,7 +1526,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); + refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -2082,7 +2081,7 @@ static __latent_entropy struct task_struct *copy_process( } else { current->signal->nr_threads++; atomic_inc(¤t->signal->live); - atomic_inc(¤t->signal->sigcnt); + refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); @@ -2439,7 +2438,7 @@ static int check_unshare_flags(unsigned long unshare_flags) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { - if (atomic_read(¤t->sighand->count) > 1) + if (refcount_read(¤t->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { diff --git a/kernel/futex.c b/kernel/futex.c index a0514e01c3eb..c3b73b0311bc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -68,6 +68,7 @@ #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> +#include <linux/refcount.h> #include <asm/futex.h> @@ -212,7 +213,7 @@ struct futex_pi_state { struct rt_mutex pi_mutex; struct task_struct *owner; - atomic_t refcount; + refcount_t refcount; union futex_key key; } __randomize_layout; @@ -321,12 +322,8 @@ static int __init fail_futex_debugfs(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-private", mode, dir, - &fail_futex.ignore_private)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); return 0; } @@ -803,7 +800,7 @@ static int refill_pi_state_cache(void) INIT_LIST_HEAD(&pi_state->list); /* pi_mutex gets initialized later */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -823,7 +820,7 @@ static struct futex_pi_state *alloc_pi_state(void) static void get_pi_state(struct futex_pi_state *pi_state) { - WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); + WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); } /* @@ -835,7 +832,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) if (!pi_state) return; - if (!atomic_dec_and_test(&pi_state->refcount)) + if (!refcount_dec_and_test(&pi_state->refcount)) return; /* @@ -865,7 +862,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * refcount is at 0 - put it back to 1. */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); current->pi_state_cache = pi_state; } } @@ -908,7 +905,7 @@ void exit_pi_state_list(struct task_struct *curr) * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. 
*/ - if (!atomic_inc_not_zero(&pi_state->refcount)) { + if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); @@ -1064,7 +1061,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, * and futex_wait_requeue_pi() as it cannot go to 0 and consequently * free pi_state before we can take a reference ourselves. */ - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!refcount_read(&pi_state->refcount)); /* * Now that we have a pi_state, we can acquire wait_lock @@ -1467,8 +1464,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * Queue the task for later wakeup for after we've released * the hb->lock. wake_q_add() grabs reference to p. */ - wake_q_add(wake_q, p); - put_task_struct(p); + wake_q_add_safe(wake_q, p); } /* @@ -3823,7 +3819,7 @@ err_unlock: #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 1e32e66c9563..2dddecbdbe6e 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) /* Duplicate gcov_info. */ active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); if (!dup) return NULL; dup->version = info->version; @@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), + iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), GFP_KERNEL); if (iter) iter->info = info; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 4a9191617076..f108a95882c6 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -19,6 +19,7 @@ #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include <trace/events/sched.h> @@ -126,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); + t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 45b68b4ea48b..f18cd5aa33e8 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,7 +9,7 @@ #include <linux/cpu.h> static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - int cpus_per_vec) + unsigned int cpus_per_vec) { const struct cpumask *siblmsk; int cpu, sibl; @@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, } static int __irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, + unsigned int startvec, + unsigned int numvecs, + unsigned int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct irq_affinity_desc *masks) { - int n, nodes, cpus_per_vec, extra_vecs, done = 0; - int 
last_affv = firstvec + numvecs; - int curvec = startvec; + unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int last_affv = firstvec + numvecs; + unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; if (!cpumask_weight(cpu_mask)) @@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, - &masks[curvec].mask, - node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, + node_to_cpumask[n]); if (++curvec == last_affv) curvec = firstvec; } - done = numvecs; - goto out; + return numvecs; } for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign, vecs_per_node; + unsigned int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, curvec = firstvec; --nodes; } - -out: return done; } @@ -174,19 +172,24 @@ out: * 2) spread other possible CPUs on these vectors */ static int irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, - cpumask_var_t *node_to_cpumask, + unsigned int startvec, unsigned int numvecs, + unsigned int firstvec, struct irq_affinity_desc *masks) { - int curvec = startvec, nr_present, nr_others; - int ret = -ENOMEM; + unsigned int curvec = startvec, nr_present, nr_others; + cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return ret; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail; + goto fail_nmsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; ret = 0; /* Stabilize the cpumasks */ @@ -217,13 +220,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, if (nr_present < numvecs) WARN_ON(nr_present + nr_others < numvecs); + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: free_cpumask_var(npresmsk); - fail: + fail_nmsk: free_cpumask_var(nmsk); return ret; } +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ + affd->nr_sets = 1; + affd->set_size[0] = affvecs; +} + /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading * @nvecs: The total number of vectors @@ -232,50 +244,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, * Returns the irq_affinity_desc pointer or NULL if allocation failed. */ struct irq_affinity_desc * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) { - int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - int curvec, usedvecs; - cpumask_var_t *node_to_cpumask; + unsigned int affvecs, curvec, usedvecs, i; struct irq_affinity_desc *masks = NULL; - int i, nr_sets; /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. + * Determine the number of vectors which need interrupt affinities + * assigned. If the pre/post request exhausts the available vectors + * then nothing to do here except for invoking the calc_sets() + * callback so the device driver can adjust to the situation. If there + * is only a single vector, then managing the queue is pointless as + * well. 
*/ - if (nvecs == affd->pre_vectors + affd->post_vectors) + if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + else + affvecs = 0; + + /* + * Simple invocations do not provide a calc_sets() callback. Install + * the generic one. + */ + if (!affd->calc_sets) + affd->calc_sets = default_calc_sets; + + /* Recalculate the sets */ + affd->calc_sets(affd, affvecs); + + if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS)) return NULL; - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) + /* Nothing to assign? */ + if (!affvecs) return NULL; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) - goto outnodemsk; + return NULL; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(&masks[curvec].mask, irq_default_affinity); + /* * Spread on present CPUs starting from affd->pre_vectors. If we * have multiple sets, build each sets affinity mask separately. */ - nr_sets = affd->nr_sets; - if (!nr_sets) - nr_sets = 1; - - for (i = 0, usedvecs = 0; i < nr_sets; i++) { - int this_vecs = affd->sets ? affd->sets[i] : affvecs; + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; int ret; ret = irq_build_affinity_masks(affd, curvec, this_vecs, - curvec, node_to_cpumask, masks); + curvec, masks); if (ret) { kfree(masks); - masks = NULL; - goto outnodemsk; + return NULL; } curvec += this_vecs; usedvecs += this_vecs; @@ -293,8 +317,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) masks[i].is_managed = 1; -outnodemsk: - free_node_to_cpumask(node_to_cpumask); return masks; } @@ -304,25 +326,22 @@ outnodemsk: * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, + const struct irq_affinity *affd) { - int resv = affd->pre_vectors + affd->post_vectors; - int vecs = maxvec - resv; - int set_vecs; + unsigned int resv = affd->pre_vectors + affd->post_vectors; + unsigned int set_vecs; if (resv > minvec) return 0; - if (affd->nr_sets) { - int i; - - for (i = 0, set_vecs = 0; i < affd->nr_sets; i++) - set_vecs += affd->sets[i]; + if (affd->calc_sets) { + set_vecs = maxvec - resv; } else { get_online_cpus(); set_vecs = cpumask_weight(cpu_possible_mask); put_online_cpus(); } - return resv + min(set_vecs, vecs); + return resv + min(set_vecs, maxvec - resv); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34e969069488..3faef4a77f71 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -730,6 +730,37 @@ out: EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** + * handle_fasteoi_nmi - irq handler for NMI interrupt lines + * @desc: the interrupt description structure for this irq + * + * A simple NMI-safe handler, considering the restrictions + * from request_nmi. + * + * Only a single callback will be issued to the chip: an ->eoi() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. 
+ */ +void handle_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + trace_irq_handler_entry(irq, action); + /* + * NMIs cannot be shared, there is only one action. + */ + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); + +/** * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * @@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * Similar to handle_fasteoi_nmi, but handling the dev_id cookie + * as a percpu pointer. + */ +void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) @@ -1278,6 +1340,17 @@ void irq_chip_mask_parent(struct irq_data *data) EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** + * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask_ack(data); +} +EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent); + +/** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ @@ -1381,6 +1454,7 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); #endif /** diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 6f636136cccc..516c00a5e867 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), + BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), }; static void @@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = { BIT_MASK_DESCR(IRQS_WAITING), BIT_MASK_DESCR(IRQS_PENDING), BIT_MASK_DESCR(IRQS_SUSPENDED), + BIT_MASK_DESCR(IRQS_NMI), }; @@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, 
const char __user *user_buf, chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_settings_is_level(desc)) { - /* Can't do level, sorry */ + if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { + /* Can't do level nor NMIs, sorry */ err = -EINVAL; } else { desc->istate |= IRQS_PENDING; @@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void) int irq; root_dir = debugfs_create_dir("irq", NULL); - if (!root_dir) - return -ENOMEM; irq_domain_debugfs_init(root_dir); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 38554bc35375..6df5ddfdb0f8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through to add to randomness */ + /* Fall through - to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca6afa267070..70c3053bc1f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_PENDING - irq is pending and replayed later * IRQS_SUSPENDED - irq is suspended + * IRQS_NMI - irq line is used to deliver NMIs */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, + IRQS_NMI = 0x00002000, }; #include "debug.h" @@ -242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc) #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ + __kstat_incr_irqs_this_cpu(desc); + desc->tot_count++; +} + static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 98a20e1594ce..b992f88c5613 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data) irq_ctx->enabled = true; } +static int irq_sim_set_type(struct irq_data *data, unsigned int type) +{ + /* We only support rising and falling edge trigger types. 
*/ + if (type & ~IRQ_TYPE_EDGE_BOTH) + return -EINVAL; + + irqd_set_trigger_type(data, type); + + return 0; +} + static struct irq_chip irq_sim_irqchip = { .name = "irq_sim", .irq_mask = irq_sim_irqmask, .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, }; static void irq_sim_handle_irq(struct irq_work *work) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ef8ad36cadcf..13539e12cd80 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; + desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) @@ -669,6 +670,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, set_irq_regs(old_regs); return ret; } + +#ifdef CONFIG_IRQ_DOMAIN +/** + * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, + struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq; + int ret = 0; + + nmi_enter(); + + irq = irq_find_mapping(domain, hwirq); + + /* + * ack_bad_irq is not NMI-safe, just report + * an invalid interrupt. + */ + if (likely(irq)) + generic_handle_irq(irq); + else + ret = -EINVAL; + + nmi_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif #endif /* Dynamic interrupt handling */ @@ -919,11 +955,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - int cpu; unsigned int sum = 0; + int cpu; if (!desc || !desc->kstat_irqs) return 0; + if (!irq_settings_is_per_cpu_devid(desc) && + !irq_settings_is_per_cpu(desc)) + return desc->tot_count; + for_each_possible_cpu(cpu) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8b0be4bd6565..9ed29e4a7dbf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); +/** + * irq_get_default_host() - Retrieve the "default" irq domain + * + * Returns: the default domain, if any. + * + * Modern code should never use this. This should only be used on + * systems that cannot implement a firmware->fwnode mapping (which + * both DT and ACPI provide). + */ +struct irq_domain *irq_get_default_host(void) +{ + return irq_default_domain; +} + static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { @@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, +static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; - fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; - fwspec->param_count = irq_data->args_count; + fwspec->fwnode = np ? 
&np->fwnode : NULL; + fwspec->param_count = count; - for (i = 0; i < irq_data->args_count; i++) - fwspec->param[i] = irq_data->args[i]; + for (i = 0; i < count; i++) + fwspec->param[i] = args[i]; } unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) @@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; - of_phandle_args_to_fwspec(irq_data, &fwspec); + of_phandle_args_to_fwspec(irq_data->np, irq_data->args, + irq_data->args_count, &fwspec); + return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); @@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { - if (WARN_ON(intsize < 2)) - return -EINVAL; - *out_hwirq = intspec[0]; - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; - return 0; + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); @@ -968,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +/** + * irq_domain_translate_twocell() - Generic translate for direct two cell + * bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_translate_twocell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root) struct irq_domain *d; domain_dir = debugfs_create_dir("domains", root); - if (!domain_dir) - return; debugfs_create_file("default", 0444, domain_dir, NULL, &irq_domain_debug_fops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84b54a17b95d..9ec34a2a6638 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -341,7 +341,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) /* The release function is promised process context */ might_sleep(); - if (!desc) + if (!desc || desc->istate & IRQS_NMI) return -EINVAL; /* Complete initialisation of *notify */ @@ -553,6 +553,21 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); +/** + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and enables are + * nested. + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + */ +void disable_nmi_nosync(unsigned int irq) +{ + disable_irq_nosync(irq); +} + void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { @@ -609,6 +624,20 @@ out: } EXPORT_SYMBOL(enable_irq); +/** + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable + * + * The interrupt to enable must have been requested through request_nmi. 
+ * Undoes the effect of one call to disable_nmi(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + */ +void enable_nmi(unsigned int irq) +{ + enable_irq(irq); +} + static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); @@ -644,6 +673,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (!desc) return -EINVAL; + /* Don't use NMIs as wake up interrupts please */ + if (desc->istate & IRQS_NMI) { + ret = -EINVAL; + goto out_unlock; + } + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ @@ -666,6 +701,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } + +out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } @@ -726,6 +763,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); @@ -1128,6 +1166,39 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); } +static bool irq_supports_nmi(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* Only IRQs directly managed by the root irqchip can be set as NMI */ + if (d->parent_data) + return false; +#endif + /* Don't support NMIs for chips behind a slow bus */ + if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) + return false; + + return d->chip->flags & IRQCHIP_SUPPORTS_NMI; +} + +static int irq_nmi_setup(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; +} + +static void irq_nmi_teardown(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + if (c->irq_nmi_teardown) + c->irq_nmi_teardown(d); +} + static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { @@ -1302,9 +1373,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. + * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; + if (desc->istate & IRQS_NMI) { + pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. 
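Editor's note: to make the new NMI request/control interface added to kernel/irq/manage.c in the hunks above and below easier to follow, here is a minimal, hypothetical driver sketch. It is not part of this patch. It assumes an interrupt line whose root irqchip advertises IRQCHIP_SUPPORTS_NMI, and it disables auto-enabling first because request_nmi() rejects lines that can auto-enable, that are shared, or that lack IRQF_PERCPU. The names my_nmi_handler(), my_nmi_init() and my_nmi_exit() are invented for illustration; irq_set_status_flags() and IRQ_NOAUTOEN are assumed from the existing genirq API and do not appear in this diff.

static irqreturn_t my_nmi_handler(int irq, void *dev_id)
{
	/* Runs in NMI context: must not sleep or take regular spinlocks. */
	return IRQ_HANDLED;
}

static int my_nmi_init(unsigned int irq, void *priv)
{
	int ret;

	/* request_nmi() refuses lines that would be auto-enabled. */
	irq_set_status_flags(irq, IRQ_NOAUTOEN);

	/* IRQF_PERCPU is mandatory; shared/polling flags are rejected. */
	ret = request_nmi(irq, my_nmi_handler, IRQF_PERCPU, "my-nmi", priv);
	if (ret)
		return ret;

	enable_nmi(irq);
	return 0;
}

static void my_nmi_exit(unsigned int irq, void *priv)
{
	/* Disable before freeing; free_nmi() warns if the line is still enabled. */
	disable_nmi_nosync(irq);
	free_nmi(irq, priv);
}

The per-CPU flavour introduced later in this same file (request_percpu_nmi(), prepare_percpu_nmi(), enable_percpu_nmi(), disable_percpu_nmi(), teardown_percpu_nmi(), free_percpu_nmi()) follows the same shape, except that preparation, enabling and teardown are CPU-local operations that must be performed on each CPU from non-preemptible context.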
@@ -1756,6 +1835,59 @@ const void *free_irq(unsigned int irq, void *dev_id) } EXPORT_SYMBOL(free_irq); +/* This function must be called with desc->lock held */ +static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) +{ + const char *devname = NULL; + + desc->istate &= ~IRQS_NMI; + + if (!WARN_ON(desc->action == NULL)) { + irq_pm_remove_action(desc, desc->action); + devname = desc->action->name; + unregister_handler_proc(irq, desc->action); + + kfree(desc->action); + desc->action = NULL; + } + + irq_settings_clr_disable_unlazy(desc); + irq_shutdown(desc); + + irq_release_resources(desc); + + irq_chip_pm_put(&desc->irq_data); + module_put(desc->owner); + + return devname; +} + +const void *free_nmi(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + const void *devname; + + if (!desc || WARN_ON(!(desc->istate & IRQS_NMI))) + return NULL; + + if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return NULL; + + /* NMI still enabled */ + if (WARN_ON(desc->depth == 0)) + disable_nmi_nosync(irq); + + raw_spin_lock_irqsave(&desc->lock, flags); + + irq_nmi_teardown(desc); + devname = __cleanup_nmi(irq, desc); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return devname; +} + /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate @@ -1925,6 +2057,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, } EXPORT_SYMBOL_GPL(request_any_context_irq); +/** + * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It sets up the IRQ line + * to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. + * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail and return a negative value. 
+ */ +int request_nmi(unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *name, void *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + /* NMI cannot be shared, used for Polling */ + if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) + return -EINVAL; + + if (!(irqflags & IRQF_PERCPU)) + return -EINVAL; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || irq_settings_can_autoenable(desc) || + !irq_settings_can_request(desc) || + WARN_ON(irq_settings_is_per_cpu_devid(desc)) || + !irq_supports_nmi(desc)) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; + action->name = name; + action->dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + raw_spin_unlock_irqrestore(&desc->lock, flags); + return -EINVAL; + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + void enable_percpu_irq(unsigned int irq, unsigned int type) { unsigned int cpu = smp_processor_id(); @@ -1959,6 +2186,11 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +void enable_percpu_nmi(unsigned int irq, unsigned int type) +{ + enable_percpu_irq(irq, type); +} + /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for @@ -1998,6 +2230,11 @@ void disable_percpu_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_percpu_irq); +void disable_percpu_nmi(unsigned int irq) +{ + disable_percpu_irq(irq); +} + /* * Internal function to unregister a percpu irqaction. */ @@ -2029,6 +2266,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ /* Found it - now remove it from the list of entries: */ desc->action = NULL; + desc->istate &= ~IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -2082,6 +2321,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) } EXPORT_SYMBOL_GPL(free_percpu_irq); +void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + return; + + kfree(__free_percpu_irq(irq, dev_id)); +} + /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup @@ -2172,6 +2424,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @name: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources for a per CPU NMI. 
Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). + * + * Dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. + * + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. + */ +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, + const char *name, void __percpu *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || !irq_settings_can_request(desc) || + !irq_settings_is_per_cpu_devid(desc) || + irq_settings_can_autoenable(desc) || + !irq_supports_nmi(desc)) + return -EINVAL; + + /* The line cannot already be NMI */ + if (desc->istate & IRQS_NMI) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD + | IRQF_NOBALANCING; + action->name = name; + action->percpu_dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->istate |= IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + +/** + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery + * + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). + * + * As a CPU local operation, this should be called from non-preemptible + * context. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. + */ +int prepare_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + int ret = 0; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return -EINVAL; + + if (WARN(!(desc->istate & IRQS_NMI), + KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", + irq)) { + ret = -EINVAL; + goto out; + } + + ret = irq_nmi_setup(desc); + if (ret) { + pr_err("Failed to setup NMI delivery: irq %u\n", irq); + goto out; + } + +out: + irq_put_desc_unlock(desc, flags); + return ret; +} + +/** + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be + * removed + * + * This call undoes the setup done by prepare_percpu_nmi(). + * + * IRQ line should not be enabled for the current CPU. + * + * As a CPU local operation, this should be called from non-preemptible + * context. 
+ */ +void teardown_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + goto out; + + irq_nmi_teardown(desc); +out: + irq_put_desc_unlock(desc, flags); +} + +/** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f3a04994e063..14934afa9e68 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { - iter->module_name[0] = '\0'; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, diff --git a/kernel/kcov.c b/kernel/kcov.c index c2277dbdbfb1..2ee38727844a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <linux/refcount.h> #include <asm/setup.h> /* Number of 64-bit words written per one comparison: */ @@ -44,7 +45,7 @@ struct kcov { * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) */ - atomic_t refcount; + refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; @@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch); static void kcov_get(struct kcov *kcov) { - atomic_inc(&kcov->refcount); + refcount_inc(&kcov->refcount); } static void kcov_put(struct kcov *kcov) { - if (atomic_dec_and_test(&kcov->refcount)) { + if (refcount_dec_and_test(&kcov->refcount)) { vfree(kcov->area); kfree(kcov); } @@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; - atomic_set(&kcov->refcount, 1); + refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); @@ -444,10 +445,8 @@ static int __init kcov_init(void) * there is no need to protect it against removal races. The * use of debugfs_create_file_unsafe() is actually safe here. 
*/ - if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { - pr_err("failed to create kcov in debugfs\n"); - return -ENOMEM; - } + debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); + return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..c83e54727131 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1396,7 +1396,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -bool within_kprobe_blacklist(unsigned long addr) +static bool __within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1410,7 +1410,26 @@ bool within_kprobe_blacklist(unsigned long addr) if (addr >= ent->start_addr && addr < ent->end_addr) return true; } + return false; +} +bool within_kprobe_blacklist(unsigned long addr) +{ + char symname[KSYM_NAME_LEN], *p; + + if (__within_kprobe_blacklist(addr)) + return true; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return false; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_kprobe_blacklist(addr); + } return false; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..5942eeafb9ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -101,6 +102,12 @@ bool kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +bool __kthread_should_park(struct task_struct *k) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); +} +EXPORT_SYMBOL_GPL(__kthread_should_park); + /** * kthread_should_park - should this kthread park now? 
* @@ -114,7 +121,7 @@ EXPORT_SYMBOL(kthread_should_stop); */ bool kthread_should_park(void) { - return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); + return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); @@ -599,7 +606,7 @@ void __kthread_init_worker(struct kthread_worker *worker, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); @@ -641,21 +648,21 @@ repeat: if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); worker->task = NULL; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -675,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -812,12 +819,12 @@ bool kthread_queue_work(struct kthread_worker *worker, bool ret = false; unsigned long flags; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); @@ -835,6 +842,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; + unsigned long flags; /* * This might happen when a pending work is reinitialized. @@ -843,7 +851,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) if (WARN_ON_ONCE(!worker)) return; - spin_lock(&worker->lock); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -852,7 +860,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) list_del_init(&work->node); kthread_insert_work(worker, work, &worker->work_list); - spin_unlock(&worker->lock); + raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); @@ -908,14 +916,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, unsigned long flags; bool ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); @@ -951,7 +959,7 @@ void kthread_flush_work(struct kthread_work *work) if (!worker) return; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); @@ -963,7 +971,7 @@ void kthread_flush_work(struct kthread_work *work) else noop = true; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); @@ -996,9 +1004,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, * any queuing is blocked by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, *flags); + raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); - spin_lock_irqsave(&worker->lock, *flags); + raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } @@ -1045,7 +1053,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, unsigned long flags; int ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) @@ -1062,7 +1070,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); @@ -1076,7 +1084,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) if (!worker) goto out; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -1090,13 +1098,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5b77a7314e01..eb0ee10a1981 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -45,7 +45,12 @@ */ DEFINE_MUTEX(klp_mutex); -static LIST_HEAD(klp_patches); +/* + * Actively used patches: enabled or in transition. Note that replaced + * or disabled patches are not listed even though the related kernel + * module still can be loaded. 
+ */ +LIST_HEAD(klp_patches); static struct kobject *klp_root_kobj; @@ -82,20 +87,43 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -static bool klp_is_patch_registered(struct klp_patch *patch) +static bool klp_initialized(void) +{ + return !!klp_root_kobj; +} + +static struct klp_func *klp_find_func(struct klp_object *obj, + struct klp_func *old_func) { - struct klp_patch *mypatch; + struct klp_func *func; - list_for_each_entry(mypatch, &klp_patches, list) - if (mypatch == patch) - return true; + klp_for_each_func(obj, func) { + if ((strcmp(old_func->old_name, func->old_name) == 0) && + (old_func->old_sympos == func->old_sympos)) { + return func; + } + } - return false; + return NULL; } -static bool klp_initialized(void) +static struct klp_object *klp_find_object(struct klp_patch *patch, + struct klp_object *old_obj) { - return !!klp_root_kobj; + struct klp_object *obj; + + klp_for_each_object(patch, obj) { + if (klp_is_module(old_obj)) { + if (klp_is_module(obj) && + strcmp(old_obj->name, obj->name) == 0) { + return obj; + } + } else if (!klp_is_module(obj)) { + return obj; + } + } + + return NULL; } struct klp_find_arg { @@ -278,170 +306,6 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static int __klp_disable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - - if (WARN_ON(!patch->enabled)) - return -EINVAL; - - if (klp_transition_patch) - return -EBUSY; - - /* enforce stacking: only the last enabled patch can be disabled */ - if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->enabled) - return -EBUSY; - - klp_init_transition(patch, KLP_UNPATCHED); - - klp_for_each_object(patch, obj) - if (obj->patched) - klp_pre_unpatch_callback(obj); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the TIF_PATCH_PENDING writes in - * klp_start_transition(). In the rare case where klp_ftrace_handler() - * is called shortly after klp_update_patch_state() switches the task, - * this ensures the handler sees that func->transition is set. - */ - smp_wmb(); - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = false; - - return 0; -} - -/** - * klp_disable_patch() - disables a registered patch - * @patch: The registered, enabled patch to be disabled - * - * Unregisters the patched functions from ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_disable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - if (!patch->enabled) { - ret = -EINVAL; - goto err; - } - - ret = __klp_disable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_disable_patch); - -static int __klp_enable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - int ret; - - if (klp_transition_patch) - return -EBUSY; - - if (WARN_ON(patch->enabled)) - return -EINVAL; - - /* enforce stacking: only the first disabled patch can be enabled */ - if (patch->list.prev != &klp_patches && - !list_prev_entry(patch, list)->enabled) - return -EBUSY; - - /* - * A reference is taken on the patch module to prevent it from being - * unloaded. 
- */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - pr_notice("enabling patch '%s'\n", patch->mod->name); - - klp_init_transition(patch, KLP_PATCHED); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the ops->func_stack writes in - * klp_patch_object(), so that klp_ftrace_handler() will see the - * func->transition updates before the handler is registered and the - * new funcs become visible to the handler. - */ - smp_wmb(); - - klp_for_each_object(patch, obj) { - if (!klp_is_object_loaded(obj)) - continue; - - ret = klp_pre_patch_callback(obj); - if (ret) { - pr_warn("pre-patch callback failed for object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - - ret = klp_patch_object(obj); - if (ret) { - pr_warn("failed to patch object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - } - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = true; - - return 0; -err: - pr_warn("failed to enable patch '%s'\n", patch->mod->name); - - klp_cancel_transition(); - return ret; -} - -/** - * klp_enable_patch() - enables a registered patch - * @patch: The registered, disabled patch to be enabled - * - * Performs the needed symbol lookups and code relocations, - * then registers the patched functions with ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_enable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - ret = __klp_enable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_enable_patch); - /* * Sysfs Interface * @@ -449,11 +313,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/transition - * /sys/kernel/livepatch/<patch>/signal * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ +static int __klp_disable_patch(struct klp_patch *patch); static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -470,40 +334,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); - if (!klp_is_patch_registered(patch)) { - /* - * Module with the patch could either disappear meanwhile or is - * not properly initialized yet. - */ - ret = -EINVAL; - goto err; - } - if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; - goto err; + goto out; } - if (patch == klp_transition_patch) { + /* + * Allow to reverse a pending transition in both ways. It might be + * necessary to complete the transition without forcing and breaking + * the system integrity. + * + * Do not allow to re-enable a disabled patch. 
+ */ + if (patch == klp_transition_patch) klp_reverse_transition(); - } else if (enabled) { - ret = __klp_enable_patch(patch); - if (ret) - goto err; - } else { + else if (!enabled) ret = __klp_disable_patch(patch); - if (ret) - goto err; - } + else + ret = -EINVAL; +out: mutex_unlock(&klp_mutex); + if (ret) + return ret; return count; - -err: - mutex_unlock(&klp_mutex); - return ret; } static ssize_t enabled_show(struct kobject *kobj, @@ -525,35 +381,6 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } -static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct klp_patch *patch; - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret) - return ret; - - if (!val) - return count; - - mutex_lock(&klp_mutex); - - patch = container_of(kobj, struct klp_patch, kobj); - if (patch != klp_transition_patch) { - mutex_unlock(&klp_mutex); - return -EINVAL; - } - - klp_send_signals(); - - mutex_unlock(&klp_mutex); - - return count; -} - static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -585,16 +412,129 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); -static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, - &signal_kobj_attr.attr, &force_kobj_attr.attr, NULL }; +static void klp_free_object_dynamic(struct klp_object *obj) +{ + kfree(obj->name); + kfree(obj); +} + +static struct klp_object *klp_alloc_object_dynamic(const char *name) +{ + struct klp_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return NULL; + + if (name) { + obj->name = kstrdup(name, GFP_KERNEL); + if (!obj->name) { + kfree(obj); + return NULL; + } + } + + INIT_LIST_HEAD(&obj->func_list); + obj->dynamic = true; + + return obj; +} + +static void klp_free_func_nop(struct klp_func *func) +{ + kfree(func->old_name); + kfree(func); +} + +static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, + struct klp_object *obj) +{ + struct klp_func *func; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return NULL; + + if (old_func->old_name) { + func->old_name = kstrdup(old_func->old_name, GFP_KERNEL); + if (!func->old_name) { + kfree(func); + return NULL; + } + } + + /* + * func->new_func is same as func->old_func. These addresses are + * set when the object is loaded, see klp_init_object_loaded(). + */ + func->old_sympos = old_func->old_sympos; + func->nop = true; + + return func; +} + +static int klp_add_object_nops(struct klp_patch *patch, + struct klp_object *old_obj) +{ + struct klp_object *obj; + struct klp_func *func, *old_func; + + obj = klp_find_object(patch, old_obj); + + if (!obj) { + obj = klp_alloc_object_dynamic(old_obj->name); + if (!obj) + return -ENOMEM; + + list_add_tail(&obj->node, &patch->obj_list); + } + + klp_for_each_func(old_obj, old_func) { + func = klp_find_func(obj, old_func); + if (func) + continue; + + func = klp_alloc_func_nop(old_func, obj); + if (!func) + return -ENOMEM; + + list_add_tail(&func->node, &obj->func_list); + } + + return 0; +} + +/* + * Add 'nop' functions which simply return to the caller to run + * the original function. 
The 'nop' functions are added to a + * patch to facilitate a 'replace' mode. + */ +static int klp_add_nops(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_object *old_obj; + + klp_for_each_patch(old_patch) { + klp_for_each_object(old_patch, old_obj) { + int err; + + err = klp_add_object_nops(patch, old_obj); + if (err) + return err; + } + } + + return 0; +} + static void klp_kobj_release_patch(struct kobject *kobj) { struct klp_patch *patch; @@ -611,6 +551,12 @@ static struct kobj_type klp_ktype_patch = { static void klp_kobj_release_object(struct kobject *kobj) { + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + + if (obj->dynamic) + klp_free_object_dynamic(obj); } static struct kobj_type klp_ktype_object = { @@ -620,6 +566,12 @@ static struct kobj_type klp_ktype_object = { static void klp_kobj_release_func(struct kobject *kobj) { + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + + if (func->nop) + klp_free_func_nop(func); } static struct kobj_type klp_ktype_func = { @@ -627,17 +579,23 @@ static struct kobj_type klp_ktype_func = { .sysfs_ops = &kobj_sysfs_ops, }; -/* - * Free all functions' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_funcs_limited(struct klp_object *obj, - struct klp_func *limit) +static void __klp_free_funcs(struct klp_object *obj, bool nops_only) { - struct klp_func *func; + struct klp_func *func, *tmp_func; + + klp_for_each_func_safe(obj, func, tmp_func) { + if (nops_only && !func->nop) + continue; + + list_del(&func->node); - for (func = obj->funcs; func->old_name && func != limit; func++) - kobject_put(&func->kobj); + /* Might be called from klp_init_patch() error path. */ + if (func->kobj_added) { + kobject_put(&func->kobj); + } else if (func->nop) { + klp_free_func_nop(func); + } + } } /* Clean up when a patched object is unloaded */ @@ -647,35 +605,111 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - klp_for_each_func(obj, func) - func->old_addr = 0; + klp_for_each_func(obj, func) { + func->old_func = NULL; + + if (func->nop) + func->new_func = NULL; + } } -/* - * Free all objects' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_objects_limited(struct klp_patch *patch, - struct klp_object *limit) +static void __klp_free_objects(struct klp_patch *patch, bool nops_only) { - struct klp_object *obj; + struct klp_object *obj, *tmp_obj; + + klp_for_each_object_safe(patch, obj, tmp_obj) { + __klp_free_funcs(obj, nops_only); + + if (nops_only && !obj->dynamic) + continue; + + list_del(&obj->node); - for (obj = patch->objs; obj->funcs && obj != limit; obj++) { - klp_free_funcs_limited(obj, NULL); - kobject_put(&obj->kobj); + /* Might be called from klp_init_patch() error path. */ + if (obj->kobj_added) { + kobject_put(&obj->kobj); + } else if (obj->dynamic) { + klp_free_object_dynamic(obj); + } } } -static void klp_free_patch(struct klp_patch *patch) +static void klp_free_objects(struct klp_patch *patch) +{ + __klp_free_objects(patch, false); +} + +static void klp_free_objects_dynamic(struct klp_patch *patch) +{ + __klp_free_objects(patch, true); +} + +/* + * This function implements the free operations that can be called safely + * under klp_mutex. + * + * The operation must be completed by calling klp_free_patch_finish() + * outside klp_mutex. 
+ */ +void klp_free_patch_start(struct klp_patch *patch) { - klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); + + klp_free_objects(patch); +} + +/* + * This function implements the free part that must be called outside + * klp_mutex. + * + * It must be called after klp_free_patch_start(). And it has to be + * the last function accessing the livepatch structures when the patch + * gets disabled. + */ +static void klp_free_patch_finish(struct klp_patch *patch) +{ + /* + * Avoid deadlock with enabled_store() sysfs callback by + * calling this outside klp_mutex. It is safe because + * this is called when the patch gets disabled and it + * cannot get enabled again. + */ + if (patch->kobj_added) { + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + } + + /* Put the module after the last access to struct klp_patch. */ + if (!patch->forced) + module_put(patch->mod); +} + +/* + * The livepatch might be freed from sysfs interface created by the patch. + * This work allows to wait until the interface is destroyed in a separate + * context. + */ +static void klp_free_patch_work_fn(struct work_struct *work) +{ + struct klp_patch *patch = + container_of(work, struct klp_patch, free_work); + + klp_free_patch_finish(patch); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - if (!func->old_name || !func->new_func) + int ret; + + if (!func->old_name) + return -EINVAL; + + /* + * NOPs get the address later. The patched module must be loaded, + * see klp_init_object_loaded(). + */ + if (!func->new_func && !func->nop) return -EINVAL; if (strlen(func->old_name) >= KSYM_NAME_LEN) @@ -690,9 +724,13 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? 
func->old_sympos : 1); + if (!ret) + func->kobj_added = true; + + return ret; } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -721,11 +759,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, - &func->old_addr); + (unsigned long *)&func->old_func); if (ret) return ret; - ret = kallsyms_lookup_size_offset(func->old_addr, + ret = kallsyms_lookup_size_offset((unsigned long)func->old_func, &func->old_size, NULL); if (!ret) { pr_err("kallsyms size lookup failed for '%s'\n", @@ -733,6 +771,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, return -ENOENT; } + if (func->nop) + func->new_func = func->old_func; + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, &func->new_size, NULL); if (!ret) { @@ -751,9 +792,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) int ret; const char *name; - if (!obj->funcs) - return -EINVAL; - if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; @@ -767,122 +805,191 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) &patch->kobj, "%s", name); if (ret) return ret; + obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) - goto free; + return ret; } - if (klp_is_object_loaded(obj)) { + if (klp_is_object_loaded(obj)) ret = klp_init_object_loaded(patch, obj); - if (ret) - goto free; - } - return 0; - -free: - klp_free_funcs_limited(obj, func); - kobject_put(&obj->kobj); return ret; } -static int klp_init_patch(struct klp_patch *patch) +static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; - int ret; + struct klp_func *func; if (!patch->objs) return -EINVAL; - mutex_lock(&klp_mutex); - + INIT_LIST_HEAD(&patch->list); + INIT_LIST_HEAD(&patch->obj_list); + patch->kobj_added = false; patch->enabled = false; + patch->forced = false; + INIT_WORK(&patch->free_work, klp_free_patch_work_fn); init_completion(&patch->finish); + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + + INIT_LIST_HEAD(&obj->func_list); + obj->kobj_added = false; + list_add_tail(&obj->node, &patch->obj_list); + + klp_for_each_func_static(obj, func) { + func->kobj_added = false; + list_add_tail(&func->node, &obj->func_list); + } + } + + if (!try_module_get(patch->mod)) + return -ENODEV; + + return 0; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, klp_root_kobj, "%s", patch->mod->name); - if (ret) { - mutex_unlock(&klp_mutex); + if (ret) return ret; + patch->kobj_added = true; + + if (patch->replace) { + ret = klp_add_nops(patch); + if (ret) + return ret; } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) - goto free; + return ret; } list_add_tail(&patch->list, &klp_patches); - mutex_unlock(&klp_mutex); - return 0; +} -free: - klp_free_objects_limited(patch, obj); +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; - mutex_unlock(&klp_mutex); + if (WARN_ON(!patch->enabled)) + return -EINVAL; - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + if (klp_transition_patch) + return -EBUSY; - return ret; + klp_init_transition(patch, KLP_UNPATCHED); + + klp_for_each_object(patch, obj) + if (obj->patched) + klp_pre_unpatch_callback(obj); + + /* + * Enforce the order of 
the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); + + klp_start_transition(); + patch->enabled = false; + klp_try_complete_transition(); + + return 0; } -/** - * klp_unregister_patch() - unregisters a patch - * @patch: Disabled patch to be unregistered - * - * Frees the data structures and removes the sysfs interface. - * - * Return: 0 on success, otherwise error - */ -int klp_unregister_patch(struct klp_patch *patch) +static int __klp_enable_patch(struct klp_patch *patch) { + struct klp_object *obj; int ret; - mutex_lock(&klp_mutex); + if (klp_transition_patch) + return -EBUSY; - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } + if (WARN_ON(patch->enabled)) + return -EINVAL; - if (patch->enabled) { - ret = -EBUSY; - goto err; - } + if (!patch->kobj_added) + return -EINVAL; - klp_free_patch(patch); + pr_notice("enabling patch '%s'\n", patch->mod->name); - mutex_unlock(&klp_mutex); + klp_init_transition(patch, KLP_PATCHED); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); + + klp_for_each_object(patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + } - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + klp_start_transition(); + patch->enabled = true; + klp_try_complete_transition(); return 0; err: - mutex_unlock(&klp_mutex); + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); return ret; } -EXPORT_SYMBOL_GPL(klp_unregister_patch); /** - * klp_register_patch() - registers a patch - * @patch: Patch to be registered + * klp_enable_patch() - enable the livepatch + * @patch: patch to be enabled * - * Initializes the data structure associated with the patch and - * creates the sysfs interface. + * Initializes the data structure associated with the patch, creates the sysfs + * interface, performs the needed symbol lookups and code relocations, + * registers the patched functions with ftrace. * - * There is no need to take the reference on the patch module here. It is done - * later when the patch is enabled. + * This function is supposed to be called from the livepatch module_init() + * callback. 
* * Return: 0 on success, otherwise error */ -int klp_register_patch(struct klp_patch *patch) +int klp_enable_patch(struct klp_patch *patch) { + int ret; + if (!patch || !patch->mod) return -EINVAL; @@ -897,12 +1004,91 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_have_reliable_stack()) { pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -ENOSYS; + return -EOPNOTSUPP; + } + + + mutex_lock(&klp_mutex); + + ret = klp_init_patch_early(patch); + if (ret) { + mutex_unlock(&klp_mutex); + return ret; } - return klp_init_patch(patch); + ret = klp_init_patch(patch); + if (ret) + goto err; + + ret = __klp_enable_patch(patch); + if (ret) + goto err; + + mutex_unlock(&klp_mutex); + + return 0; + +err: + klp_free_patch_start(patch); + + mutex_unlock(&klp_mutex); + + klp_free_patch_finish(patch); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * This function removes replaced patches. + * + * We could be pretty aggressive here. It is called in the situation where + * these structures are no longer accessible. All functions are redirected + * by the klp_transition_patch. They use either a new code or they are in + * the original code because of the special nop function patches. + * + * The only exception is when the transition was forced. In this case, + * klp_ftrace_handler() might still see the replaced patch on the stack. + * Fortunately, it is carefully designed to work with removed functions + * thanks to RCU. We only have to keep the patches on the system. Also + * this is handled transparently by patch->module_put. + */ +void klp_discard_replaced_patches(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + + old_patch->enabled = false; + klp_unpatch_objects(old_patch); + klp_free_patch_start(old_patch); + schedule_work(&old_patch->free_work); + } +} + +/* + * This function removes the dynamically allocated 'nop' functions. + * + * We could be pretty aggressive. NOPs do not change the existing + * behavior except for adding unnecessary delay by the ftrace handler. + * + * It is safe even when the transition was forced. The ftrace handler + * will see a valid ops->func_stack entry thanks to RCU. + * + * We could even free the NOPs structures. They must be the last entry + * in ops->func_stack. Therefore unregister_ftrace_function() is called. + * It does the same as klp_synchronize_transition() to make sure that + * nobody is inside the ftrace handler once the operation finishes. + * + * IMPORTANT: It must be called right after removing the replaced patches! + */ +void klp_discard_nops(struct klp_patch *new_patch) +{ + klp_unpatch_objects_dynamic(klp_transition_patch); + klp_free_objects_dynamic(klp_transition_patch); } -EXPORT_SYMBOL_GPL(klp_register_patch); /* * Remove parts of patches that touch a given kernel module. The list of @@ -915,7 +1101,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod, struct klp_patch *patch; struct klp_object *obj; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { if (patch == limit) break; @@ -923,21 +1109,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod, if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - /* - * Only unpatch the module if the patch is enabled or - * is in transition. 
- */ - if (patch->enabled || patch == klp_transition_patch) { + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); - if (patch != klp_transition_patch) - klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); - - klp_post_unpatch_callback(obj); - } + klp_post_unpatch_callback(obj); klp_free_object_loaded(obj); break; @@ -962,7 +1141,7 @@ int klp_module_coming(struct module *mod) */ mod->klp_alive = true; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; @@ -976,13 +1155,6 @@ int klp_module_coming(struct module *mod) goto err; } - /* - * Only patch the module if the patch is enabled or is - * in transition. - */ - if (!patch->enabled && patch != klp_transition_patch) - break; - pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 48a83d4364cf..ec43a40b853f 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -5,6 +5,17 @@ #include <linux/livepatch.h> extern struct mutex klp_mutex; +extern struct list_head klp_patches; + +#define klp_for_each_patch_safe(patch, tmp_patch) \ + list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list) + +#define klp_for_each_patch(patch) \ + list_for_each_entry(patch, &klp_patches, list) + +void klp_free_patch_start(struct klp_patch *patch); +void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) { diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 7702cb4064fc..99cb3ad05eb4 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -34,7 +34,7 @@ static LIST_HEAD(klp_ops); -struct klp_ops *klp_find_ops(unsigned long old_addr) +struct klp_ops *klp_find_ops(void *old_func) { struct klp_ops *ops; struct klp_func *func; @@ -42,7 +42,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr) list_for_each_entry(ops, &klp_ops, node) { func = list_first_entry(&ops->func_stack, struct klp_func, stack_node); - if (func->old_addr == old_addr) + if (func->old_func == old_func) return ops; } @@ -118,7 +118,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, } } + /* + * NOPs are used to replace existing patches with original code. + * Do nothing! Setting pc would cause an infinite loop. 
+ */ + if (func->nop) + goto unlock; + klp_arch_set_pc(regs, (unsigned long)func->new_func); + unlock: preempt_enable_notrace(); } @@ -142,17 +150,18 @@ static void klp_unpatch_func(struct klp_func *func) if (WARN_ON(!func->patched)) return; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (WARN_ON(!ops)) return; if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -174,17 +183,18 @@ static int klp_patch_func(struct klp_func *func) struct klp_ops *ops; int ret; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return -EINVAL; if (WARN_ON(func->patched)) return -EINVAL; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (!ops) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); @@ -236,15 +246,26 @@ err: return ret; } -void klp_unpatch_object(struct klp_object *obj) +static void __klp_unpatch_object(struct klp_object *obj, bool nops_only) { struct klp_func *func; - klp_for_each_func(obj, func) + klp_for_each_func(obj, func) { + if (nops_only && !func->nop) + continue; + if (func->patched) klp_unpatch_func(func); + } - obj->patched = false; + if (obj->dynamic || !nops_only) + obj->patched = false; +} + + +void klp_unpatch_object(struct klp_object *obj) +{ + __klp_unpatch_object(obj, false); } int klp_patch_object(struct klp_object *obj) @@ -267,11 +288,21 @@ int klp_patch_object(struct klp_object *obj) return 0; } -void klp_unpatch_objects(struct klp_patch *patch) +static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only) { struct klp_object *obj; klp_for_each_object(patch, obj) if (obj->patched) - klp_unpatch_object(obj); + __klp_unpatch_object(obj, nops_only); +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, false); +} + +void klp_unpatch_objects_dynamic(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, true); } diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index e72d8250d04b..d5f2fbe373e0 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -10,7 +10,7 @@ * struct klp_ops - structure for tracking registered ftrace ops structs * * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch + * (klp_func structs) which have the same old_func. This allows the switch * between function versions to happen instantaneously by updating the klp_ops * struct's func_stack list. The winner is the klp_func at the top of the * func_stack (front of the list). 
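For context, outside the diff itself: with klp_register_patch() and klp_unregister_patch() gone, a livepatch module is expected to do everything through klp_enable_patch() from its module_init() callback, as the kernel-doc above states. Below is a minimal sketch in the spirit of samples/livepatch/livepatch-sample.c; the patched function, its replacement and the module naming are placeholders rather than anything added by this series.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_puts(m, "this has been live patched\n");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* name == NULL means the object is vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
	/*
	 * Setting .replace = true here would make this a cumulative patch
	 * that replaces earlier ones, see klp_discard_replaced_patches().
	 */
};

static int livepatch_init(void)
{
	/* Initializes, patches and starts the transition in a single call. */
	return klp_enable_patch(&patch);
}

static void livepatch_exit(void)
{
	/* Disabling and freeing is driven via sysfs and the patch free_work. */
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");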
@@ -25,10 +25,11 @@ struct klp_ops { struct ftrace_ops fops; }; -struct klp_ops *klp_find_ops(unsigned long old_addr); +struct klp_ops *klp_find_ops(void *old_func); int klp_patch_object(struct klp_object *obj); void klp_unpatch_object(struct klp_object *obj); void klp_unpatch_objects(struct klp_patch *patch); +void klp_unpatch_objects_dynamic(struct klp_patch *patch); #endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 304d5eb8a98c..9c89ae8b337a 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,11 +29,13 @@ #define MAX_STACK_ENTRIES 100 #define STACK_ERR_BUF_SIZE 128 +#define SIGNALS_TIMEOUT 15 + struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; -static bool klp_forced = false; +static unsigned int klp_signals_cnt; /* * This work can be performed periodically to finish patching or unpatching any @@ -87,6 +89,11 @@ static void klp_complete_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + klp_discard_replaced_patches(klp_transition_patch); + klp_discard_nops(klp_transition_patch); + } + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -136,13 +143,6 @@ static void klp_complete_transition(void) pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * klp_forced set implies unbounded increase of module's ref count if - * the module is disabled/enabled in a loop. - */ - if (!klp_forced && klp_target_state == KLP_UNPATCHED) - module_put(klp_transition_patch->mod); - klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } @@ -224,11 +224,11 @@ static int klp_check_stack_func(struct klp_func *func, * Check for the to-be-patched function * (the previous func). */ - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (list_is_singular(&ops->func_stack)) { /* original function */ - func_addr = func->old_addr; + func_addr = (unsigned long)func->old_func; func_size = func->old_size; } else { /* previously patched function */ @@ -348,6 +348,47 @@ done: } /* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. + */ +static void klp_send_signals(void) +{ + struct task_struct *g, *task; + + if (klp_signals_cnt == SIGNALS_TIMEOUT) + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptibly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated.
+ */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} + +/* * Try to switch all remaining tasks to the target patch state by walking the * stacks of sleeping tasks and looking for any to-be-patched or * to-be-unpatched functions. If such functions are found, the task can't be @@ -359,6 +400,7 @@ void klp_try_complete_transition(void) { unsigned int cpu; struct task_struct *g, *task; + struct klp_patch *patch; bool complete = true; WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); @@ -396,6 +438,10 @@ void klp_try_complete_transition(void) put_online_cpus(); if (!complete) { + if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) + klp_send_signals(); + klp_signals_cnt++; + /* * Some tasks weren't able to be switched over. Try again * later and/or wait for other methods like kernel exit @@ -407,7 +453,18 @@ void klp_try_complete_transition(void) } /* we're done, now cleanup the data structures */ + patch = klp_transition_patch; klp_complete_transition(); + + /* + * It would make more sense to free the patch in + * klp_complete_transition() but it is called also + * from klp_cancel_transition(). + */ + if (!patch->enabled) { + klp_free_patch_start(patch); + schedule_work(&patch->free_work); + } } /* @@ -446,6 +503,8 @@ void klp_start_transition(void) if (task->patch_state != klp_target_state) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + + klp_signals_cnt = 0; } /* @@ -569,47 +628,6 @@ void klp_copy_process(struct task_struct *child) } /* - * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. - * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this - * action currently. - */ -void klp_send_signals(void) -{ - struct task_struct *g, *task; - - pr_notice("signaling remaining tasks\n"); - - read_lock(&tasklist_lock); - for_each_process_thread(g, task) { - if (!klp_patch_pending(task)) - continue; - - /* - * There is a small race here. We could see TIF_PATCH_PENDING - * set and decide to wake up a kthread or send a fake signal. - * Meanwhile the task could migrate itself and the action - * would be meaningless. It is not serious though. - */ - if (task->flags & PF_KTHREAD) { - /* - * Wake up a kthread which sleeps interruptedly and - * still has not been migrated. - */ - wake_up_state(task, TASK_INTERRUPTIBLE); - } else { - /* - * Send fake signal to all non-kthread tasks which are - * still not migrated. - */ - spin_lock_irq(&task->sighand->siglock); - signal_wake_up(task, 0); - spin_unlock_irq(&task->sighand->siglock); - } - } - read_unlock(&tasklist_lock); -} - -/* * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an * existing transition to finish. 
* @@ -620,6 +638,7 @@ void klp_send_signals(void) */ void klp_force_transition(void) { + struct klp_patch *patch; struct task_struct *g, *task; unsigned int cpu; @@ -633,5 +652,6 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_forced = true; + klp_for_each_patch(patch) + patch->forced = true; } diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index f9d0bc016067..322db16233de 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,7 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); -void klp_send_signals(void); void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..34cdcbedda49 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,11 +45,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <asm/sections.h> @@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644); * code to recurse back into the lockdep code... */ static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { @@ -130,13 +134,17 @@ static inline int debug_locks_off_graph_unlock(void) unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); /* * All data structures here are protected by the global debug_lock. * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use. */ +#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE (1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; #ifndef CONFIG_DEBUG_LOCKDEP static @@ -277,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock) #endif /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class. */ LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. + * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + * are about to be freed. + */ +struct pending_free { + struct list_head zapped; + DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. 
+ * + * @rcu_head: Used to schedule an RCU callback for freeing data structures. + * @index: Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf: Array with information about data structures about to be freed. + */ +static struct delayed_free { + struct rcu_head rcu_head; + int index; + int scheduled; + struct pending_free pf[2]; +} delayed_free; /* * The lockdep classes are in a hash-table as well, for fast lookup: @@ -331,6 +370,11 @@ void lockdep_on(void) } EXPORT_SYMBOL(lockdep_on); +void lockdep_set_selftest_task(struct task_struct *task) +{ + lockdep_selftest_task_struct = task; +} + /* * Debugging switches: */ @@ -599,7 +643,7 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, @@ -716,6 +760,17 @@ static bool assign_lock_key(struct lockdep_map *lock) { unsigned long can_addr, addr = (unsigned long)lock; +#ifdef __KERNEL__ + /* + * lockdep_free_key_range() assumes that struct lock_class_key + * objects do not overlap. Since we use the address of lock + * objects as class key for static objects, check whether the + * size of lock_class_key objects does not exceed the size of + * the smallest lock object. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif + if (__is_kernel_percpu_address(addr, &can_addr)) lock->key = (void *)can_addr; else if (__is_module_percpu_address(addr, &can_addr)) @@ -735,6 +790,289 @@ static bool assign_lock_key(struct lockdep_map *lock) return true; } +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ + struct list_head *f; + + list_for_each(f, h) { + if (e == f) + return true; + } + + return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ + struct lock_class *class; + int i; + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (in_list(e, &class->locks_after) || + in_list(e, &class->locks_before)) + return true; + } + return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ + struct lock_list *e; + + list_for_each_entry(e, h, entry) { + if (e->links_to != c) { + printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", + c->name ? : "(?)", + (unsigned long)(e - list_entries), + e->links_to && e->links_to->name ? + e->links_to->name : "(?)", + e->class && e->class->name ? e->class->name : + "(?)"); + return false; + } + } + return true; +} + +#ifdef CONFIG_PROVE_LOCKING +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +#endif + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING + u64 chain_key = 0; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + /* + * The 'unsigned long long' casts avoid that a compiler warning + * is reported when building tools/lib/lockdep. 
+ */ + if (chain->chain_key != chain_key) { + printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", + (unsigned long long)(chain - lock_chains), + (unsigned long long)chain->chain_key, + (unsigned long long)chain_key); + return false; + } +#endif + return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ + struct pending_free *pf; + int i; + + for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { + if (in_list(&class->lock_entry, &pf->zapped)) + return true; + } + + return false; +} + +static bool __check_data_structures(void) +{ + struct lock_class *class; + struct lock_chain *chain; + struct hlist_head *head; + struct lock_list *e; + int i; + + /* Check whether all classes occur in a lock list. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!in_list(&class->lock_entry, &all_lock_classes) && + !in_list(&class->lock_entry, &free_lock_classes) && + !in_any_zapped_class_list(class)) { + printk(KERN_INFO "class %px/%s is not in any class list\n", + class, class->name ? : "(?)"); + return false; + } + } + + /* Check whether all classes have valid lock lists. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!class_lock_list_valid(class, &class->locks_before)) + return false; + if (!class_lock_list_valid(class, &class->locks_after)) + return false; + } + + /* Check the chain_key of all lock chains. */ + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + if (!check_lock_chain_key(chain)) + return false; + } + } + + /* + * Check whether all list entries that are in use occur in a class + * lock list. + */ + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (!in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class->name ? : "(?)", + e->links_to->name ? : "(?)"); + return false; + } + } + + /* + * Check whether all list entries that are not in use do not occur in + * a class lock list. + */ + for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class && e->class->name ? e->class->name : + "(?)", + e->links_to && e->links_to->name ? + e->links_to->name : "(?)"); + return false; + } + } + + return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ + static bool once = false; + + if (check_consistency && !once) { + if (!__check_data_structures()) { + once = true; + WARN_ON(once); + } + } +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. 
+ */ +static void init_data_structures_once(void) +{ + static bool ds_initialized, rcu_head_initialized; + int i; + + if (likely(rcu_head_initialized)) + return; + + if (system_state >= SYSTEM_SCHEDULING) { + init_rcu_head(&delayed_free.rcu_head); + rcu_head_initialized = true; + } + + if (ds_initialized) + return; + + ds_initialized = true; + + INIT_LIST_HEAD(&delayed_free.pf[0].zapped); + INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); + INIT_LIST_HEAD(&lock_classes[i].locks_after); + INIT_LIST_HEAD(&lock_classes[i].locks_before); + } +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ + unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + + return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + unsigned long flags; + + if (WARN_ON_ONCE(static_obj(key))) + return; + hash_head = keyhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) + goto restore_irqs; + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (WARN_ON_ONCE(k == key)) + goto out_unlock; + } + hlist_add_head_rcu(&key->hash_entry, hash_head); +out_unlock: + graph_unlock(); +restore_irqs: + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + bool found = false; + + if (WARN_ON_ONCE(static_obj(key))) + return false; + + /* + * If lock debugging is disabled lock_keys_hash[] may contain + * pointers to memory that has already been freed. Avoid triggering + * a use-after-free in that case by returning early. + */ + if (!debug_locks) + return true; + + hash_head = keyhashentry(key); + + rcu_read_lock(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -756,7 +1094,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (!lock->key) { if (!assign_lock_key(lock)) return NULL; - } else if (!static_obj(lock->key)) { + } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) { return NULL; } @@ -775,11 +1113,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) goto out_unlock_set; } - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + init_data_structures_once(); + + /* Allocate a new lock class and add it to the hash. 
*/ + class = list_first_entry_or_null(&free_lock_classes, typeof(*class), + lock_entry); + if (!class) { if (!debug_locks_off_graph_unlock()) { return NULL; } @@ -788,13 +1127,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) dump_stack(); return NULL; } - class = lock_classes + nr_lock_classes++; + nr_lock_classes++; debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); /* * We use RCU's safe list-add method to make @@ -802,9 +1141,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) */ hlist_add_head_rcu(&class->hash_entry, hash_head); /* - * Add it to the global list of classes: + * Remove the class from the free list and add it to the global list + * of classes. */ - list_add_tail(&class->lock_entry, &all_lock_classes); + list_move_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -845,7 +1185,10 @@ out_set_class_cache: */ static struct lock_list *alloc_list_entry(void) { - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + int idx = find_first_zero_bit(list_entries_in_use, + ARRAY_SIZE(list_entries)); + + if (idx >= ARRAY_SIZE(list_entries)) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -853,13 +1196,16 @@ static struct lock_list *alloc_list_entry(void) dump_stack(); return NULL; } - return list_entries + nr_list_entries++; + nr_list_entries++; + __set_bit(idx, list_entries_in_use); + return list_entries + idx; } /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, +static int add_lock_to_list(struct lock_class *this, + struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, struct stack_trace *trace) { @@ -873,6 +1219,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, return 0; entry->class = this; + entry->links_to = links_to; entry->distance = distance; entry->trace = *trace; /* @@ -955,7 +1302,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -965,7 +1312,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1624,29 +1971,18 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + return (bit & LOCK_USAGE_READ_MASK) ? 
state_rnames[bit >> 2] : state_names[bit >> 2]; } static int exclusive_bit(int new_bit) { - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; + int state = new_bit & LOCK_USAGE_STATE_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * keep state, bit flip the direction and strip read. */ - return state | (dir ^ 2); + return state | (dir ^ LOCK_USAGE_DIR_MASK); } static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, @@ -1842,6 +2178,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, struct lock_list this; int ret; + if (!hlock_class(prev)->key || !hlock_class(next)->key) { + /* + * The warning statements below may trigger a use-after-free + * of the class name. It is better to trigger a use-after free + * and to have the class name most of the time instead of not + * having the class name available. + */ + WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(prev), + hlock_class(prev)->name); + WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(next), + hlock_class(next)->name); + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by @@ -1918,14 +2272,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(next), + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, next->acquire_ip, distance, trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, next->acquire_ip, distance, trace); if (!ret) @@ -2018,8 +2372,8 @@ out_bug: return 0; } -unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; @@ -2153,6 +2507,33 @@ static int check_no_collision(struct task_struct *curr, } /* + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. + */ +long lockdep_next_lockchain(long i) +{ + i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); + return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ + return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ + int idx = find_first_zero_bit(lock_chains_in_use, + ARRAY_SIZE(lock_chains)); + + if (unlikely(idx >= ARRAY_SIZE(lock_chains))) + return NULL; + __set_bit(idx, lock_chains_in_use); + return lock_chains + idx; +} + +/* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. * @@ -2169,19 +2550,15 @@ static inline int add_chain_cache(struct task_struct *curr, int i, j; /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - - /* - * We might need to take the graph lock, ensure we've got IRQs + * The caller must hold the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. 
for recursion reasons * lockdep won't complain about its own locking errors. */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + chain = alloc_lock_chain(); + if (!chain) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2189,7 +2566,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } - chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); @@ -2206,16 +2582,8 @@ static inline int add_chain_cache(struct task_struct *curr, chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; - } - - if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + } else { if (!debug_locks_off_graph_unlock()) return 0; @@ -2223,7 +2591,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } -#endif hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); @@ -2233,19 +2600,16 @@ static inline int add_chain_cache(struct task_struct *curr, } /* - * Look up a dependency chain. + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held. */ static inline struct lock_chain *lookup_chain_cache(u64 chain_key) { struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { + if (READ_ONCE(chain->chain_key) == chain_key) { debug_atomic_inc(chain_lookup_hits); return chain; } @@ -2662,8 +3026,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; + int read = new_bit & LOCK_USAGE_READ_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * mark USED_IN has to look forwards -- to ensure no dependency @@ -2687,19 +3051,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * states. 
*/ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; /* * Check for read in write conflicts */ if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) + if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) return 0; if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) + !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, + state_name(new_bit + LOCK_USAGE_READ_MASK))) return 0; } @@ -2709,35 +3073,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) { - enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { + enum lock_usage_bit hlock_bit = base_bit; hlock = curr->held_locks + i; - usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) - usage_bit += 1; /* READ */ + hlock_bit += LOCK_USAGE_READ_MASK; - BUG_ON(usage_bit >= LOCK_USAGE_STATES); + BUG_ON(hlock_bit >= LOCK_USAGE_STATES); if (!hlock->check) continue; - if (!mark_lock(curr, hlock, usage_bit)) + if (!mark_lock(curr, hlock, hlock_bit)) return 0; } @@ -2758,7 +3115,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, HARDIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2766,7 +3123,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2814,6 +3171,7 @@ void lockdep_hardirqs_on(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } +NOKPROBE_SYMBOL(lockdep_hardirqs_on); /* * Hardirqs were disabled: @@ -2843,6 +3201,7 @@ void lockdep_hardirqs_off(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } +NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -2880,7 +3239,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); current->lockdep_recursion = 0; } @@ -3119,13 +3478,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!key)) return; /* - * Sanity check, the lock-class key must be persistent: + * Sanity check, the lock-class key must either have been allocated + * statically or must have been registered as a dynamic key. */ - if (!static_obj(key)) { - printk("BUG: key %px not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. 
- */ + if (!static_obj(key) && !is_dynamic_key(key)) { + if (debug_locks) + printk(KERN_ERR "BUG: key %px has not been registered!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } @@ -3335,6 +3693,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!debug_locks_silent) { + WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); + WARN_ON_ONCE(!hlock_class(hlock)->key); + } + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; @@ -3497,6 +3860,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3535,6 +3901,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3650,7 +4019,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 0; } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3883,6 +4253,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { @@ -4123,29 +4494,131 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* Remove a class from a lock chain. Must be called with the graph lock held. */ +static void remove_class_from_lock_chain(struct pending_free *pf, + struct lock_chain *chain, + struct lock_class *class) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_chain *new_chain; + u64 chain_key; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) { + if (chain_hlocks[i] != class - lock_classes) + continue; + /* The code below leaks one chain_hlock[] entry. */ + if (--chain->depth > 0) { + memmove(&chain_hlocks[i], &chain_hlocks[i + 1], + (chain->base + chain->depth - i) * + sizeof(chain_hlocks[0])); + } + /* + * Each lock class occurs at most once in a lock chain so once + * we found a match we can break out of this loop. + */ + goto recalc; + } + /* Since the chain has not been modified, return. */ + return; + +recalc: + chain_key = 0; + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + if (chain->depth && chain->chain_key == chain_key) + return; + /* Overwrite the chain key for concurrent RCU readers. */ + WRITE_ONCE(chain->chain_key, chain_key); + /* + * Note: calling hlist_del_rcu() from inside a + * hlist_for_each_entry_rcu() loop is safe. + */ + hlist_del_rcu(&chain->entry); + __set_bit(chain - lock_chains, pf->lock_chains_being_freed); + if (chain->depth == 0) + return; + /* + * If the modified lock chain matches an existing lock chain, drop + * the modified lock chain. + */ + if (lookup_chain_cache(chain_key)) + return; + new_chain = alloc_lock_chain(); + if (WARN_ON_ONCE(!new_chain)) { + debug_locks_off(); + return; + } + *new_chain = *chain; + hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); +#endif +} + +/* Must be called with the graph lock held. 
*/ +static void remove_class_from_lock_chains(struct pending_free *pf, + struct lock_class *class) +{ + struct lock_chain *chain; + struct hlist_head *head; + int i; + + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + remove_class_from_lock_chain(pf, chain, class); + } + } +} + /* * Remove all references to a lock class. The caller must hold the graph lock. */ -static void zap_class(struct lock_class *class) +static void zap_class(struct pending_free *pf, struct lock_class *class) { + struct lock_list *entry; int i; + WARN_ON_ONCE(!class->key); + /* * Remove all dependencies this lock is * involved in: */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + entry = list_entries + i; + if (entry->class != class && entry->links_to != class) + continue; + __clear_bit(i, list_entries_in_use); + nr_list_entries--; + list_del_rcu(&entry->entry); + } + if (list_empty(&class->locks_after) && + list_empty(&class->locks_before)) { + list_move_tail(&class->lock_entry, &pf->zapped); + hlist_del_rcu(&class->hash_entry); + WRITE_ONCE(class->key, NULL); + WRITE_ONCE(class->name, NULL); + nr_lock_classes--; + } else { + WARN_ONCE(true, "%s() failed for class %s\n", __func__, + class->name); } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - hlist_del_rcu(&class->hash_entry); - list_del(&class->lock_entry); - RCU_INIT_POINTER(class->key, NULL); - RCU_INIT_POINTER(class->name, NULL); + remove_class_from_lock_chains(pf, class); +} + +static void reinit_class(struct lock_class *class) +{ + void *const p = class; + const unsigned int offset = offsetof(struct lock_class, key); + + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + memset(p + offset, 0, sizeof(*class) - offset); + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); } static inline int within(const void *addr, void *start, unsigned long size) @@ -4153,55 +4626,175 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +static bool inside_selftest(void) +{ + return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ + return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); + /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. - * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. + * Schedule an RCU callback if no RCU callback is pending. Must be called with + * the graph lock held. 
*/ -void lockdep_free_key_range(void *start, unsigned long size) +static void call_rcu_zapped(struct pending_free *pf) +{ + WARN_ON_ONCE(inside_selftest()); + + if (list_empty(&pf->zapped)) + return; + + if (delayed_free.scheduled) + return; + + delayed_free.scheduled = true; + + WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); + delayed_free.index ^= 1; + + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} + +/* The caller must hold the graph lock. May be called from RCU context. */ +static void __free_zapped_classes(struct pending_free *pf) { struct lock_class *class; - struct hlist_head *head; + + check_data_structures(); + + list_for_each_entry(class, &pf->zapped, lock_entry) + reinit_class(class); + + list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING + bitmap_andnot(lock_chains_in_use, lock_chains_in_use, + pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); + bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ + struct pending_free *pf; unsigned long flags; - int i; - int locked; + + if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) + return; raw_local_irq_save(flags); - locked = graph_lock(); + if (!graph_lock()) + goto out_irq; + + /* closed head */ + pf = delayed_free.pf + (delayed_free.index ^ 1); + __free_zapped_classes(pf); + delayed_free.scheduled = false; /* - * Unhash all classes that were created by this module: + * If there's anything on the open list, close and start a new callback. */ + call_rcu_zapped(delayed_free.pf + delayed_free.index); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, + unsigned long size) +{ + struct lock_class *class; + struct hlist_head *head; + int i; + + /* Unhash all classes that were created by a module. */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; hlist_for_each_entry_rcu(class, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); + if (!within(class->key, start, size) && + !within(class->name, start, size)) + continue; + zap_class(pf, class); } } +} - if (locked) - graph_unlock(); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. + */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + + init_data_structures_once(); + + raw_local_irq_save(flags); + locked = graph_lock(); + if (!locked) + goto out_irq; + + pf = get_pending_free(); + __lockdep_free_key_range(pf, start, size); + call_rcu_zapped(pf); + + graph_unlock(); +out_irq: raw_local_irq_restore(flags); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. - * - * sync_sched() is sufficient because the read-side is IRQ disable. 
*/ synchronize_rcu(); +} - /* - * XXX at this point we could return the resources to the pool; - * instead we leak them. We would need to change to bitmap allocators - * instead of the linear allocators we have now. - */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. + */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_free_key_range(pf, start, size); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_free_key_range_imm(start, size); + else + lockdep_free_key_range_reg(start, size); } /* @@ -4226,14 +4819,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock) return false; } -void lockdep_reset_lock(struct lockdep_map *lock) +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, + struct lockdep_map *lock) { struct lock_class *class; - unsigned long flags; - int j, locked; - - raw_local_irq_save(flags); - locked = graph_lock(); + int j; /* * Remove all classes this lock might have: @@ -4244,27 +4835,104 @@ void lockdep_reset_lock(struct lockdep_map *lock) */ class = look_up_lock_class(lock, j); if (class) - zap_class(class); + zap_class(pf, class); } /* * Debug check: in the end all mapped classes should * be gone. */ - if (unlikely(lock_class_cache_is_registered(lock))) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); + if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) + debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. + */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + + raw_local_irq_save(flags); + locked = graph_lock(); + if (!locked) + goto out_irq; + + pf = get_pending_free(); + __lockdep_reset_lock(pf, lock); + call_rcu_zapped(pf); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); +} + +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. + */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_reset_lock(pf, lock); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_reset_lock_imm(lock); + else + lockdep_reset_lock_reg(lock); +} + +/* Unregister a dynamically allocated key. 
*/ +void lockdep_unregister_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head = keyhashentry(key); + struct lock_class_key *k; + struct pending_free *pf; + unsigned long flags; + bool found = false; + + might_sleep(); + + if (WARN_ON_ONCE(static_obj(key))) + return; + + raw_local_irq_save(flags); + if (!graph_lock()) + goto out_irq; + + pf = get_pending_free(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + hlist_del_rcu(&k->hash_entry); + found = true; + break; } - goto out_restore; } - if (locked) - graph_unlock(); - -out_restore: + WARN_ON_ONCE(!found); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + graph_unlock(); +out_irq: raw_local_irq_restore(flags); + + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ + synchronize_rcu(); } +EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { @@ -4278,20 +4946,24 @@ void __init lockdep_init(void) printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE + printk(" memory used by lock dependency info: %zu kB\n", + (sizeof(lock_classes) + + sizeof(classhash_table) + + sizeof(list_entries) + + sizeof(list_entries_in_use) + + sizeof(chainhash_table) + + sizeof(delayed_free) #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + + sizeof(lock_cq) + + sizeof(lock_chains) + + sizeof(lock_chains_in_use) + + sizeof(chain_hlocks) #endif ) / 1024 ); - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); + printk(" per task-struct memory footprint: %zu bytes\n", + sizeof(((struct task_struct *)NULL)->held_locks)); } static void diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 88c847a41c8a..d4c197425f68 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -22,6 +22,10 @@ enum lock_usage_bit { LOCK_USAGE_STATES }; +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK 2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) + /* * Usage-state bitmasks: */ @@ -96,7 +100,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void); extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3d31f9b0059e..9c49ec645d8b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = { #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { + if (*pos < 0) + return NULL; + if (*pos == 0) return SEQ_START_TOKEN; - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; + return lock_chains + (*pos - 1); } static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { - (*pos)++; + *pos = lockdep_next_lockchain(*pos - 1) + 1; return lc_start(m, pos); } @@ -268,7 +268,7 @@ static int lockdep_stats_show(struct 
seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); + lock_chain_count(), MAX_LOCKDEP_CHAINS); seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); #endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7d0b0ed74404..ad40a2617063 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ @@ -45,7 +32,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); @@ -970,7 +957,7 @@ static int __init lock_torture_init(void) /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); + onoff_interval * HZ, NULL); if (firsterr) goto unwind; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 8a8c3c208c5e..5e9247dc2515 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -412,12 +409,28 @@ pv_queue: idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. 
+ */ + if (unlikely(idx >= MAX_NODES)) { + qstat_inc(qstat_lock_no_node, true); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + node = grab_mcs_node(node, idx); /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_idx1 + idx - 1, idx); + qstat_inc(qstat_lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 42d3d8dc8f49..d73f85388d5c 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -30,6 +30,13 @@ * pv_wait_node - # of vCPU wait's at a non-head queue node * lock_pending - # of locking operations via pending code * lock_slowpath - # of locking operations via MCS lock queue + * lock_use_node2 - # of locking operations that use 2nd per-CPU node + * lock_use_node3 - # of locking operations that use 3rd per-CPU node + * lock_use_node4 - # of locking operations that use 4th per-CPU node + * lock_no_node - # of locking operations without using per-CPU node + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. * * Writing to the "reset_counters" file will reset all the above counter * values. @@ -55,9 +62,10 @@ enum qlock_stats { qstat_pv_wait_node, qstat_lock_pending, qstat_lock_slowpath, - qstat_lock_idx1, - qstat_lock_idx2, - qstat_lock_idx3, + qstat_lock_use_node2, + qstat_lock_use_node3, + qstat_lock_use_node4, + qstat_lock_no_node, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -85,9 +93,10 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_wait_node] = "pv_wait_node", [qstat_lock_pending] = "lock_pending", [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_idx1] = "lock_index1", - [qstat_lock_idx2] = "lock_index2", - [qstat_lock_idx3] = "lock_index3", + [qstat_lock_use_node2] = "lock_use_node2", + [qstat_lock_use_node3] = "lock_use_node3", + [qstat_lock_use_node4] = "lock_use_node4", + [qstat_lock_no_node] = "lock_no_node", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 50d9af615dc4..fbe96341beee 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -211,9 +211,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * Ensure issuing the wakeup (either by us or someone else) * after setting the reader waiter to nil. 
*/ - wake_q_add(wake_q, tsk); - /* wake_q_add() already take the task ref */ - put_task_struct(tsk); + wake_q_add_safe(wake_q, tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..0b9aa8ab89f0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig { if (!debug) return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, mod->name)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); -#endif + ddebug_add_module(debug, num, mod->name); } static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) diff --git a/kernel/panic.c b/kernel/panic.c index f121e6ba7e11..0ae0d7332f12 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, - NULL, - clear_warn_once_set, - "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, + "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0200, NULL, - NULL, &clear_warn_once_fops); + debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, + &clear_warn_once_fops); return 0; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9dc2c38764a..7d66ee68aaaf 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> +#include <linux/debugfs.h> #include <linux/energy_model.h> #include <linux/sched/topology.h> #include <linux/slab.h> @@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data); */ static DEFINE_MUTEX(em_pd_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +{ + struct dentry *d; + char name[24]; + + snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + + /* Create per-cs directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_ulong("frequency", 0444, d, &cs->frequency); + debugfs_create_ulong("power", 0444, d, &cs->power); + debugfs_create_ulong("cost", 0444, d, &cs->cost); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +{ + struct dentry *d; + char name[8]; + int i; + + snprintf(name, sizeof(name), "pd%d", cpu); + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(name, rootdir); + + debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + + /* Create a sub-directory for each capacity state */ + for (i = 0; i < pd->nr_cap_states; i++) + em_debug_create_cs(&pd->table[i], d); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +core_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +#endif static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, struct em_data_callback *cb) { @@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, pd->nr_cap_states = nr_states; 
cpumask_copy(to_cpumask(pd->cpus), span); + em_debug_create_pd(pd, cpu); + return pd; free_cs_table: diff --git a/kernel/power/qos.c b/kernel/power/qos.c index b7a82502857a..9d22131afc1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -582,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - if (d) { - (void)debugfs_create_file(qos->name, S_IRUGO, d, - (void *)qos, &pm_qos_debug_fops); - } + debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, + &pm_qos_debug_fops); return misc_register(&qos->pm_qos_power_miscdev); } @@ -685,8 +683,6 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); d = debugfs_create_dir("pm_qos", NULL); - if (IS_ERR_OR_NULL(d)) - d = NULL; for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d170374ceb..8eee85bb2687 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -344,7 +344,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; enum log_flags { LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -356,6 +355,9 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ +#ifdef CONFIG_PRINTK_CALLER + u32 caller_id; /* thread id or processor id */ +#endif } #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __packed __aligned(4) @@ -422,7 +424,11 @@ static u64 exclusive_console_stop_seq; static u64 clear_seq; static u32 clear_idx; +#ifdef CONFIG_PRINTK_CALLER +#define PREFIX_MAX 48 +#else #define PREFIX_MAX 32 +#endif #define LOG_LINE_MAX (1024 - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) @@ -577,7 +583,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static int log_store(int facility, int level, +static int log_store(u32 caller_id, int 
facility, int level, enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) @@ -625,6 +631,9 @@ static int log_store(int facility, int level, msg->ts_nsec = ts_nsec; else msg->ts_nsec = local_clock(); +#ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; +#endif memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -688,12 +697,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; + char caller[20]; +#ifdef CONFIG_PRINTK_CALLER + u32 id = msg->caller_id; + + snprintf(caller, sizeof(caller), ",caller=%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); +#else + caller[0] = '\0'; +#endif do_div(ts_usec, 1000); - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-'); + return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -1038,6 +1056,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); +#ifdef CONFIG_PRINTK_CALLER + VMCOREINFO_OFFSET(printk_log, caller_id); +#endif } #endif @@ -1236,10 +1257,23 @@ static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); - return sprintf(buf, "[%5lu.%06lu] ", + return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } +#ifdef CONFIG_PRINTK_CALLER +static size_t print_caller(u32 id, char *buf) +{ + char caller[12]; + + snprintf(caller, sizeof(caller), "%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); + return sprintf(buf, "[%6s]", caller); +} +#else +#define print_caller(id, buf) 0 +#endif + static size_t print_prefix(const struct printk_log *msg, bool syslog, bool time, char *buf) { @@ -1247,8 +1281,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, if (syslog) len = print_syslog((msg->facility << 3) | msg->level, buf); + if (time) len += print_time(msg->ts_nsec, buf + len); + + len += print_caller(msg->caller_id, buf + len); + + if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { + buf[len++] = ' '; + buf[len] = '\0'; + } + return len; } @@ -1752,6 +1795,12 @@ static inline void printk_delay(void) } } +static inline u32 printk_caller_id(void) +{ + return in_task() ? task_pid_nr(current) : + 0x80000000 + raw_smp_processor_id(); +} + /* * Continuation lines are buffered, and not committed to the record buffer * until the line is complete, or a race forces it. 
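The printk_caller_id() helper above records either the printing thread's PID or, in interrupt context, the CPU number with the top bit set; print_caller() and msg_print_ext_header() then render it as "Tnnn" or "Cnnn". A standalone sketch (not kernel code) of that decoding, mirroring the format used in the hunks above:

/* Standalone sketch: decode a caller id as produced by printk_caller_id().
 * Top bit set => CPU id of an interrupt-context message, otherwise the
 * printing thread's PID. */
#include <stdio.h>

static void show_caller(unsigned int id)
{
        printf("[%c%u]\n", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000u);
}

int main(void)
{
        show_caller(1234);                /* task context, PID 1234 -> [T1234] */
        show_caller(0x80000000u | 3);     /* interrupt context on CPU 3 -> [C3] */
        return 0;
}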
The line fragments @@ -1761,7 +1810,7 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - struct task_struct *owner; /* task of first print*/ + u32 caller_id; /* printk_caller_id() of first print */ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ @@ -1773,12 +1822,13 @@ static void cont_flush(void) if (cont.len == 0) return; - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.caller_id, cont.facility, cont.level, cont.flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.len = 0; } -static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) +static bool cont_add(u32 caller_id, int facility, int level, + enum log_flags flags, const char *text, size_t len) { /* If the line gets too long, split it up in separate records. */ if (cont.len + len > sizeof(cont.buf)) { @@ -1789,7 +1839,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * if (!cont.len) { cont.facility = facility; cont.level = level; - cont.owner = current; + cont.caller_id = caller_id; cont.ts_nsec = local_clock(); cont.flags = flags; } @@ -1809,13 +1859,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { + const u32 caller_id = printk_caller_id(); + /* * If an earlier line was buffered, and we're a continuation - * write from the same process, try to add it to the buffer. + * write from the same context, try to add it to the buffer. */ if (cont.len) { - if (cont.owner == current && (lflags & LOG_CONT)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Otherwise, make sure it's flushed */ @@ -1828,12 +1880,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Store it in the record log */ - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); + return log_store(caller_id, facility, level, lflags, 0, + dict, dictlen, text, text_len); } /* Must be called under logbuf_lock. */ @@ -1867,9 +1920,6 @@ int vprintk_store(int facility, int level, case '0' ... 
'7': if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; - /* fallthrough */ - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; break; case 'c': /* KERN_CONT */ lflags |= LOG_CONT; @@ -1884,7 +1934,7 @@ int vprintk_store(int facility, int level, level = default_message_loglevel; if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; + lflags |= LOG_NEWLINE; return log_output(facility, level, lflags, dict, dictlen, text, text_len); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 939a2056c87a..37301430970e 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -87,36 +87,6 @@ config RCU_STALL_COMMON config RCU_NEED_SEGCBLIST def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a393e24a9195..acee72c0b24b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update definitions shared among RCU implementations. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2011 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_H @@ -30,7 +17,7 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +/* Offset to allow distinguishing irq vs. task-based idle entry/exit. 
*/ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -462,8 +449,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, RCU_TASKS_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5aff271adf1e..9bd5f6023c21 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU segmented callback lists, function definitions * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/types.h> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 948470cef385..71b64648464e 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists, internal-to-rcu header file * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/rcu_segcblist.h> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b459da70b4fc..c29761152874 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based performance-test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2015 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #define pr_fmt(fmt) fmt @@ -54,7 +41,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ @@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); * Various other use cases may of course be specified. */ +#ifdef MODULE +# define RCUPERF_SHUTDOWN 0 +#else +# define RCUPERF_SHUTDOWN 1 +#endif + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), +torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f6e85faa4ff4..f14d1b18a74f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based torture test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2005, 2006 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt @@ -61,7 +48,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); /* Bits for ->extendables field, extendables param, and related definitions. */ @@ -1630,21 +1617,34 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. 
*/ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +struct rcu_launder_hist { + long n_launders; + unsigned long launder_gp_seq; +}; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) +static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; +static unsigned long rcu_launder_gp_seq_start; static void rcu_torture_fwd_cb_hist(void) { + unsigned long gps; + unsigned long gps_old; int i; int j; for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) + if (n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); - for (j = 0; j <= i; j++) - pr_cont(" %ds/%d: %ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + gps_old = rcu_launder_gp_seq_start; + for (j = 0; j <= i; j++) { + gps = n_launders_hist[j].launder_gp_seq; + pr_cont(" %ds/%d: %ld:%ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + rcutorture_seq_diff(gps, gps_old)); + gps_old = gps; + } pr_cont("\n"); } @@ -1666,7 +1666,8 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i]++; + n_launders_hist[i].n_launders++; + n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); spin_unlock_irqrestore(&rcu_fwd_lock, flags); } @@ -1786,9 +1787,10 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs = 0; n_max_gps = 0; for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; + n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); + rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); @@ -2228,6 +2230,14 @@ static void rcu_test_debug_objects(void) #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } +static void rcutorture_sync(void) +{ + static unsigned long n; + + if (cur_ops->sync && !(++n & 0xfff)) + cur_ops->sync(); +} + static int __init rcu_torture_init(void) { @@ -2389,7 +2399,8 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, + rcutorture_sync); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32dfd6522548..5d4a39a6505a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny version for non-preemptible single-CPU use. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
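rcu_torture_init() above now passes rcutorture_sync() to torture_onoff_init(), and rcutorture_sync() invokes cur_ops->sync() only on every 4096th call by masking a static counter. A standalone sketch (not kernel code) of that rate-limiting idiom:

/* Standalone sketch of the idiom used by rcutorture_sync(): do the
 * expensive work only once every 4096 invocations. */
#include <stdio.h>

static void maybe_do_expensive_work(void)
{
        static unsigned long n;

        if (!(++n & 0xfff))              /* true on calls 4096, 8192, ... */
                printf("expensive work after %lu calls\n", n);
}

int main(void)
{
        for (int i = 0; i < 10000; i++)
                maybe_do_expensive_work();
        return 0;
}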
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #include <linux/export.h> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3600d88d8956..a60b8ba9e1ac 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ #define spin_lock_rcu_node(p) \ @@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) snp->grphi = cpu; } sdp->cpu = cpu; - INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) @@ -386,13 +375,19 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) } else { flush_delayed_work(&ssp->work); } - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) + if (WARN_ON(timer_pending(&sdp->delay_work))) + return; /* Just leak it! */ + if (WARN_ON(work_pending(&sdp->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); } + } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", @@ -463,39 +458,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } -/* - * Track online CPUs to guide callback workqueue placement. 
- */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t) { - WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} + struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ - WRITE_ONCE(per_cpu(srcu_online, cpu), false); + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever. This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. - */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp, unsigned long delay) { - bool ret; + if (!delay) { + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); + return; + } - preempt_disable(); - if (READ_ONCE(per_cpu(srcu_online, cpu))) - ret = queue_delayed_work_on(cpu, wq, dwork, delay); - else - ret = queue_delayed_work(wq, dwork, delay); - preempt_enable(); - return ret; + timer_reduce(&sdp->delay_work, jiffies + delay); } /* @@ -504,7 +483,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); + srcu_queue_delayed_work_on(sdp, delay); } /* @@ -1186,7 +1165,8 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct srcu_data *sdp; struct srcu_struct *ssp; - sdp = container_of(work, struct srcu_data, work.work); + sdp = container_of(work, struct srcu_data, work); + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be10036fa621..a8304d90573f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU-based infrastructure for lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 5f5963ba313e..911bd9076d43 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU @@ -76,7 +63,7 @@ void rcu_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { if (user) { rcu_qs(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..acd6ccf56faf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - @@ -62,6 +49,8 @@ #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> +#include <linux/sysrq.h> +#include <linux/kprobes.h> #include "tree.h" #include "rcu.h" @@ -115,6 +104,9 @@ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); /* * The rcu_scheduler_active variable is initialized to the value @@ -479,7 +471,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -504,13 +495,12 @@ unsigned long rcu_exp_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Force a quiescent state. + * Return the root node of the rcu_state structure. */ -void rcu_force_quiescent_state(void) +static struct rcu_node *rcu_get_root(void) { - force_quiescent_state(); + return &rcu_state.node[0]; } -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * Convert a ->gp_state value to a character string. 
@@ -529,19 +519,30 @@ void show_rcu_gp_kthreads(void) { int cpu; unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, rcu_state.gp_kthread->state, j); + rcu_state.gp_state, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -550,14 +551,35 @@ void show_rcu_gp_kthreads(void) ULONG_CMP_GE(rcu_state.gp_seq, rdp->gp_seq_needed)) continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); } } /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); + /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -566,8 +588,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, { switch (test_type) { case RCU_FLAVOR: - case RCU_BH_FLAVOR: - case RCU_SCHED_FLAVOR: *flags = READ_ONCE(rcu_state.gp_flags); *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; @@ -578,14 +598,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) -{ - return &rcu_state.node[0]; -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -701,7 +713,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) /** * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit? * * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. 
@@ -872,6 +883,7 @@ void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } +NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -1115,7 +1127,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq! * The above code handles this, but only for straight cond_resched(). * And some in-kernel loops check need_resched() before calling * cond_resched(), which defeats the above code for CPUs that are @@ -1181,7 +1193,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { @@ -1310,7 +1322,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) panic_on_rcu_stall(); - force_quiescent_state(); /* Kick them all. */ + rcu_force_quiescent_state(); /* Kick them all. */ } static void print_cpu_stall(void) @@ -1557,17 +1569,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(void) { - if (current == rcu_state.gp_kthread || + if ((current == rcu_state.gp_kthread && + !in_interrupt() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); swake_up_one(&rcu_state.gp_wq); } @@ -1711,7 +1734,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ - if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) rdp->gp_seq_needed = rnp->gp_seq_needed; WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1939,7 +1962,7 @@ static void rcu_gp_fqs_loop(void) if (!ret) { rcu_state.jiffies_force_qs = jiffies + j; WRITE_ONCE(rcu_state.jiffies_kick_kthreads, - jiffies + 3 * j); + jiffies + (j ? 
3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), @@ -2497,14 +2520,14 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop. It also schedules RCU + * core processing. If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing a providing the needed quiescent state. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2517,7 +2540,7 @@ void rcu_check_callbacks(int user) } __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_flavor_check_callbacks(user); + rcu_flavor_sched_clock_irq(user); if (rcu_pending()) invoke_rcu_core(); @@ -2578,7 +2601,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2610,6 +2633,7 @@ static void force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * This function checks for grace-period requests that fail to motivate @@ -2657,16 +2681,11 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, - rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); } /* @@ -2711,12 +2730,8 @@ void rcu_fwd_progress_check(unsigned long j) } EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -/* - * This does the RCU core processing work for the specified rcu_data - * structures. This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU. */ +static __latent_entropy void rcu_core(struct softirq_action *unused) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2801,9 +2816,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, /* * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() + * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() * if some other CPU has recently done so. 
Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback + * invoking rcu_force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2820,7 +2835,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rdp->blimit = LONG_MAX; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(); + rcu_force_quiescent_state(); rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } @@ -2889,9 +2904,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_init(&rdp->cblist); } rcu_segcblist_enqueue(&rdp->cblist, head, lazy); - if (!lazy) - rcu_idle_count_callbacks_posted(); - if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, @@ -2961,6 +2973,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(kfree_call_rcu); +/* + * During early boot, any blocking grace-period wait automatically + * implies a grace period. Later on, this is never the case for PREEMPT. + * + * Howevr, because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. + */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT)) + return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. 
Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + */ +void synchronize_rcu(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3049,28 +3134,6 @@ static int rcu_pending(void) } /* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - - rdp = this_cpu_ptr(&rcu_data); - if (!rcu_segcblist_empty(&rdp->cblist)) { - hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) - al = false; - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ @@ -3299,7 +3362,7 @@ int rcutree_prepare_cpu(unsigned int cpu) trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); return 0; } @@ -3329,8 +3392,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->ffmask |= rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); @@ -3355,8 +3416,6 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_offline_cpu(cpu); return 0; } @@ -3777,7 +3836,7 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_core); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d90b02b53c0e..bb4f995f2d3f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
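The consolidated synchronize_rcu() documented above waits for all pre-existing readers before returning. The classic update-side pattern it enables is: unpublish the element, wait for a grace period, then reclaim. A hedged sketch of that pattern; struct foo, foo_list, and foo_lock are illustrative names, not from this patch:

/* Hedged sketch of the usual synchronize_rcu() update-side pattern. */
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
        struct list_head list;
        int data;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

static void foo_remove(struct foo *p)
{
        spin_lock(&foo_lock);
        list_del_rcu(&p->list);          /* unpublish: new readers miss it */
        spin_unlock(&foo_lock);

        synchronize_rcu();               /* wait for pre-existing readers  */
        kfree(p);                        /* now safe to reclaim            */
}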
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/cache.h> @@ -36,7 +23,6 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { - smp_call_func_t rew_func; unsigned long rew_s; struct work_struct rew_work; }; @@ -194,10 +180,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* Nonlazy_posted snapshot. */ + bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ @@ -234,7 +217,13 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 6) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) RCU priority boosting. */ + struct task_struct *rcu_cpu_kthread_task; + /* rcuc per-CPU kthread or NULL. */ + unsigned int rcu_cpu_kthread_status; + char rcu_cpu_has_work; + + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -303,6 +292,8 @@ struct rcu_state { struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ + unsigned long gp_wake_time; /* Last GP kthread wake. */ + unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ /* End of fields guarded by root rcu_node's lock. 
*/ @@ -402,13 +393,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); @@ -420,7 +404,7 @@ static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); +static void rcu_flavor_sched_clock_irq(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); @@ -431,7 +415,6 @@ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -451,7 +434,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_all_nocb_kthreads(int cpu); +static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); @@ -462,11 +445,3 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); - -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 928fe5893a57..4c2a0189e748 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -1,27 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU expedited grace periods * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2016 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> */ #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); + /* * Record the start of an expedited grace period. */ @@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; - smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ @@ -396,7 +383,7 @@ retry_ipi: mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -426,7 +413,7 @@ retry_ipi: * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void) { int cpu; struct rcu_node *rnp; @@ -440,7 +427,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - rnp->rew.rew_func = func; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { @@ -449,7 +435,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - preempt_disable(); cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ if (unlikely(cpu > rnp->grphi - rnp->grplo)) @@ -457,7 +442,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) else cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); - preempt_enable(); rnp->exp_need_flush = true; } @@ -580,10 +564,10 @@ static void rcu_exp_wait_wake(unsigned long s) * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(func); + sync_rcu_exp_select_cpus(); /* Wait and clean up, including waking everyone. */ rcu_exp_wait_wake(s); @@ -597,52 +581,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ - struct rcu_data *rdp; - struct rcu_exp_work rew; - struct rcu_node *rnp; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(); - if (exp_funnel_lock(s)) - return; /* Someone else did our work for us. */ - - /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { - /* Direct call during scheduler init and early_initcalls(). 
*/ - rcu_exp_sel_wait_wake(func, s); - } else { - /* Marshall arguments & schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); - } - - /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); - rnp = rcu_get_root(); - wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ - - /* Let the next expedited grace period start. */ - mutex_unlock(&rcu_state.exp_mutex); + rcu_exp_sel_wait_wake(rewp->rew_s); } #ifdef CONFIG_PREEMPT_RCU @@ -654,7 +593,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func) * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -697,6 +636,7 @@ static void sync_rcu_exp_handler(void *unused) WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; } /* @@ -730,43 +670,10 @@ static void sync_sched_exp_online_cleanup(int cpu) { } -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - * - * This has the same semantics as (but is more brutal than) synchronize_rcu(). - */ -void synchronize_rcu_expedited(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - _synchronize_rcu_expedited(sync_rcu_exp_handler); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -798,44 +705,78 @@ static void sync_sched_exp_online_cleanup(int cpu) rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); WARN_ON_ONCE(ret); } -/* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). 
It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. - */ -static int rcu_blocking_is_gp(void) -{ - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; -} +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU grace period, but expedite it. The basic idea is to + * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether + * the CPU is in an RCU critical section, and if so, it sets a flag that + * causes the outermost rcu_read_unlock() to report the quiescent state + * for RCU-preempt or asks the scheduler for help for RCU-sched. On the + * other hand, if the CPU is not in an RCU read-side critical section, + * the IPI handler reports the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). + */ void synchronize_rcu_expedited(void) { + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - /* If only one CPU, this is automatically a grace period. */ + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) return; - _synchronize_rcu_expedited(sync_sched_exp_handler); + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) + return; /* Someone else did our work for us. */ + + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew.rew_work); + } + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rcu_get_root(); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(s)); + smp_mb(); /* Workqueue actions happen before return. */ + + /* Let the next expedited grace period start. 
*/ + mutex_unlock(&rcu_state.exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b3dd2fc0cd6..97dba50f6fb2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1,27 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic * or preemptible semantics. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/delay.h> @@ -34,17 +21,7 @@ #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST - #include "../locking/rtmutex_common.h" - -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - #else /* #ifdef CONFIG_RCU_BOOST */ /* @@ -307,7 +284,7 @@ static void rcu_qs(void) __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ + barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ current->rcu_read_unlock_special.b.need_qs = false; } } @@ -788,13 +765,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU. When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; @@ -825,54 +802,6 @@ static void rcu_flavor_check_callbacks(int user) t->rcu_read_unlock_special.b.need_qs = true; } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. 
Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, @@ -1088,14 +1017,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * Check to see if this CPU is in a non-context-switch quiescent state, + * namely user mode and idle loop. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -1115,22 +1040,6 @@ static void rcu_flavor_check_callbacks(int user) } } -/* PREEMPT=n implementation of synchronize_rcu(). */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Because preemptible RCU does not exist, tasks cannot possibly exit * while in preemptible RCU read-side critical sections. 
@@ -1307,11 +1216,11 @@ static void invoke_rcu_callbacks_kthread(void) unsigned long flags; local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), + __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); } local_irq_restore(flags); } @@ -1322,7 +1231,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __this_cpu_read(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1369,11 +1278,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(this_cpu_ptr(&rcu_data)); -} - static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; @@ -1384,12 +1288,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) static void rcu_cpu_kthread_park(unsigned int cpu) { - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __this_cpu_read(rcu_cpu_has_work); + return __this_cpu_read(rcu_data.rcu_cpu_has_work); } /* @@ -1399,21 +1303,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); local_irq_disable(); work = *workp; *workp = 0; local_irq_enable(); if (work) - rcu_kthread_do_work(); + rcu_do_batch(this_cpu_ptr(&rcu_data)); local_bh_enable(); if (*workp == 0) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); @@ -1459,7 +1362,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) } static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, + .store = &rcu_data.rcu_cpu_kthread_task, .thread_should_run = rcu_cpu_kthread_should_run, .thread_fn = rcu_cpu_kthread, .thread_comm = "rcuc/%u", @@ -1476,7 +1379,7 @@ static void __init rcu_spawn_boost_kthreads(void) int cpu; for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) return; rcu_for_each_leaf_node(rnp) @@ -1543,7 +1446,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return rcu_cpu_has_callbacks(NULL); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1562,14 +1465,6 @@ static void rcu_prepare_for_idle(void) 
{ } -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1652,11 +1547,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* Snapshot to detect later posting of non-lazy callback. */ - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { + if (rcu_segcblist_empty(&rdp->cblist)) { *nextevt = KTIME_MAX; return 0; } @@ -1670,11 +1562,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (!rdp->all_lazy) { + rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); + if (rdp->all_lazy) { + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + } else { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - } else { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } *nextevt = basemono + dj * TICK_NSEC; return 0; @@ -1704,7 +1597,7 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); if (tne != rdp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(NULL)) + if (!rcu_segcblist_empty(&rdp->cblist)) invoke_rcu_core(); /* force nohz to see update. */ rdp->tick_nohz_enabled_snap = tne; return; @@ -1717,10 +1610,8 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdp->all_lazy && - rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { rdp->all_lazy = false; - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1756,19 +1647,6 @@ static void rcu_cleanup_after_idle(void) invoke_rcu_core(); } -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_data.nonlazy_posted, 1); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_FAST_NO_HZ @@ -1776,13 +1654,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdp->all_lazy ? 'L' : '.', - rdp->tick_nohz_enabled_snap ? '.' : 'D'); + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!rdp->tick_nohz_enabled_snap]); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1868,22 +1745,24 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) /* * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. 
For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into leaders, which manage incoming callbacks, wait for + * grace periods, and awaken followers, and the followers, which only + * invoke callbacks. Each leader is its own follower. The no-CBs CPUs + * do a wake_up() on their kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) * * This is intended to be used in conjunction with Frederic Weisbecker's * adaptive-idle work, which would seriously reduce OS jitter on CPUs * running CPU-bound user-mode computations. * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. */ @@ -1987,10 +1866,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ +/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ static bool rcu_nocb_cpu_needs_barrier(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2006,8 +1882,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * callbacks would be posted. In the worst case, the first * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be - * a barrier between the following load an posting of a callback + * getting the concurrency design right!). There must also be a + * barrier between the following load and posting of a callback * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ @@ -2517,9 +2393,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. + * rcuo kthread, spawn it. */ -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { if (rcu_scheduler_fully_active) rcu_spawn_one_nocb_kthread(cpu); @@ -2536,7 +2412,7 @@ static void __init rcu_spawn_nocb_kthreads(void) int cpu; for_each_online_cpu(cpu) - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); } /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). 
*/ @@ -2670,7 +2546,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..cbaa976c5945 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -52,6 +39,7 @@ #include <linux/tick.h> #include <linux/rcupdate_wait.h> #include <linux/sched/isolation.h> +#include <linux/kprobes.h> #define CREATE_TRACE_POINTS @@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void) current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..e81b17b53fa5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7cbb5658be80..ead464a0f2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old CPU in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock(), the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new CPU in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. + * If we observe the new CPU in task_rq_lock(), the address + * dependency headed by '[L] rq = task_rq()' and the acquire + * will pair with the WMB to ensure we then also see migrating. 
*/ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); @@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif + update_rq_clock_pelt(rq, delta); } void update_rq_clock(struct rq *rq) @@ -396,19 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif -/** - * wake_q_add() - queue a wakeup for 'later' waking. - * @head: the wake_q_head to add @task to - * @task: the task to queue for 'later' wakeup - * - * Queue a task for later wakeup, most likely by the wake_up_q() call in the - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come - * instantly. - * - * This function must be used as-if it were wake_up_process(); IOW the task - * must be ready to be woken at this location. - */ -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; @@ -421,16 +411,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * state, even in the failed case, an explicit smp_mb() must be used. */ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); + if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) + return false; /* * The head is context local, there can be no concurrency. */ *head->lastp = node; head->lastp = &node->next; + return true; +} + +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task)) + get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. 
+ */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ + if (!__wake_q_add(head, task)) + put_task_struct(task); } void wake_up_q(struct wake_q_head *head) @@ -928,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -2190,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } @@ -2431,7 +2464,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); - post_init_entity_util_avg(&p->se); + post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; @@ -5265,9 +5298,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct old_timespec32 __user *, interval) +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5867,14 +5899,11 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. The hotplug lock is nevertheless taken to satisfy lockdep, - * but there won't be any contention on it. + * happen. */ - cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); - cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 22bd8980f32f..835671f0f917 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); * * Clear the update_util_data pointer for the given CPU. * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() + * Callers must use RCU callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_rcu() * right after this function to avoid use-after-free. 
*/ void cpufreq_remove_update_util_hook(int cpu) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..2efe629425be 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -859,7 +859,7 @@ static void sugov_stop(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) cpufreq_remove_update_util_hook(cpu); - synchronize_sched(); + synchronize_rcu(); if (!policy->fast_switch_enabled) { irq_work_sync(&sg_policy->irq_work); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb8b7b5d745d..6a73e41a2016 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index de3de997e245..8039d62ae36e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..ea74d43924b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { - if (!cfs_rq->on_list) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cfs_rq->on_list) + return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; + + cfs_rq->on_list = 1; + + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. 
Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the top of the tree + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases and a special case for the root - * cfs_rq. Furthermore, it also means that we will always reset - * tmp_alone_branch either when the branch is connected - * to a tree or when we reach the beg of the tree + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { - /* - * If parent is already on the list, we add the child - * just before. Thanks to circular linked property of - * the list, this means to put the child at the tail - * of the list that starts by parent. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); - /* - * The branch is now connected to its tree so we can - * reset tmp_alone_branch to the beginning of the - * list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else if (!cfs_rq->tg->parent) { - /* - * cfs rq without parent should be put - * at the tail of the list. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - /* - * We have reach the beg of a tree so we can reset - * tmp_alone_branch to the beginning of the list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else { - /* - * The parent has not already been added so we want to - * make sure that it will be put after us. - * tmp_alone_branch points to the beg of the branch - * where we will add parent. - */ - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - rq->tmp_alone_branch); - /* - * update tmp_alone_branch to points to the new beg - * of the branch - */ - rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; - } + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; + } - cfs_rq->on_list = 1; + if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the top of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; } + + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the begin of the branch + * where we will add parent. 
+ */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new begin + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; + return false; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + + /* + * With cfs_rq being unthrottled/throttled during an enqueue, + * it can happen the tmp_alone_branch points the a leaf that + * we finally want to del. In this case, tmp_alone_branch moves + * to the prev element but it will point to rq->leaf_cfs_rq_list + * at the end of the enqueue. + */ + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) + rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); cfs_rq->on_list = 0; } } -/* Iterate through all leaf cfs_rq's on a runqueue: */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ + SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - - #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { + return true; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +} + +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -#ifdef CONFIG_SMP #include "pelt.h" -#include "sched-pelt.h" +#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. 
*/ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); @@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - if (entity_is_task(se)) { - struct task_struct *p = task_of(se); - if (p->sched_class != &fair_sched_class) { - /* - * For !fair tasks do: - * - update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); - switched_from_fair(rq, p); - * - * such that the next switched_to_fair() has the - * expected state. - */ - se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); - return; - } + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se, 0); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; } attach_entity_cfs_rq(se); @@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se) { } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { } static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) @@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256; unsigned int sysctl_numa_balancing_scan_delay = 1000; struct numa_group { - atomic_t refcount; + refcount_t refcount; spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; @@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p) unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; } @@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p) unsigned long private = group_faults_priv(ng); unsigned long period = smax; - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; @@ -1160,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1180,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1400,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. 
*/ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1848,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2095,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p) static inline int get_numa_group(struct numa_group *grp) { - return atomic_inc_not_zero(&grp->refcount); + return refcount_inc_not_zero(&grp->refcount); } static inline void put_numa_group(struct numa_group *grp) { - if (atomic_dec_and_test(&grp->refcount)) + if (refcount_dec_and_test(&grp->refcount)) kfree_rcu(grp, rcu); } @@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) return; - atomic_set(&grp->refcount, 1); + refcount_set(&grp->refcount, 1); grp->active_nodes = 1; grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); @@ -2638,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } @@ -3122,7 +3136,7 @@ void set_task_rq_fair(struct sched_entity *se, p_last_update_time = prev->avg.last_update_time; n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -3257,11 +3271,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with CPU capacity wehreas the runnable sum - * is not we rescale running_sum 1st + * Rescale running sum to be in the same range as runnable sum + * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] + * runnable_sum is in [0 : LOAD_AVG_MAX] */ - running_sum = se->avg.util_sum / - arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; @@ -3364,7 +3378,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages - * @now: current time, as per cfs_rq_clock_task() + * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) @@ -3409,7 +3423,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) decayed = 1; } - decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ 
-3499,9 +3513,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; /* @@ -3509,7 +3521,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * track group sched_entity load average for task_h_load calc in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); + __update_load_avg_se(now, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); @@ -3561,7 +3573,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); + __update_load_avg_blocked_se(last_update_time, se); } /* @@ -3577,10 +3589,6 @@ void remove_entity_load_avg(struct sched_entity *se) * tasks cannot exit without having gone through wake_up_new_task() -> * post_init_entity_util_avg() which will have added things to the * cfs_rq, so we can remove unconditionally. - * - * Similarly for groups, they will have passed through - * post_init_entity_util_avg() before unregister_sched_fair_group() - * calls this. */ sync_entity_load_avg(se); @@ -3654,6 +3662,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { long last_ewma_diff; struct util_est ue; + int cpu; if (!sched_feat(UTIL_EST)) return; @@ -3688,6 +3697,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. 
+ */ + cpu = cpu_of(rq_of(cfs_rq)); + if (task_util(p) > capacity_orig_of(cpu)) + return; + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample @@ -4429,6 +4446,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; + + /* Add cfs_rq with already running entity in the list */ + if (cfs_rq->nr_running >= 1) + list_add_leaf_cfs_rq(cfs_rq); } return 0; @@ -4440,8 +4461,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_task = rq_clock_task(rq); + list_del_leaf_cfs_rq(cfs_rq); + } cfs_rq->throttle_count++; return 0; @@ -4544,6 +4567,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } + assert_list_leaf_cfs_rq(rq); + if (!se) add_nr_running(rq, task_delta); @@ -4565,7 +4590,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4607,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (!remaining) break; @@ -4598,7 +4623,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4665,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); @@ -4753,17 +4778,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } @@ -4774,18 +4800,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (expires == cfs_b->runtime_expires) 
lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -4863,20 +4889,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); + unsigned long flags; int overrun; int idle = 0; - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun); + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -4986,6 +5013,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ + +static inline bool cfs_bandwidth_used(void) +{ + return false; +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { return rq_clock_task(rq_of(cfs_rq)); @@ -5177,6 +5210,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } + if (cfs_bandwidth_used()) { + /* + * When bandwidth control is enabled; the cfs_rq_throttled() + * breaks in the above iteration can result in incomplete + * leaf list maintenance, resulting in triggering the assertion + * below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) + break; + } + } + + assert_list_leaf_cfs_rq(rq); + hrtick_update(rq); } @@ -5556,11 +5606,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6053,7 +6098,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - cpumask_clear_cpu(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); if (!available_idle_cpu(cpu)) idle = false; } @@ -6073,7 +6118,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. 
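To summarize the locking change running through the distribute_cfs_runtime(), do_sched_cfs_period_timer(), do_sched_cfs_slack_timer() and sched_cfs_period_timer() hunks above: every rq lock and cfs_b->lock section in the bandwidth-timer paths moves to the _irqsave/_irqrestore variants, so the critical sections no longer assume the caller already runs with interrupts disabled (hrtimer callbacks are not guaranteed to). A minimal sketch of the resulting pattern, illustrative only and not a copy of the patch:

static void toy_period_timer_body(struct cfs_bandwidth *cfs_b)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&cfs_b->lock, flags);
	/* decide how much runtime there is to hand out */
	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

	/* walk throttled cfs_rqs, taking each rq lock with rq_lock_irqsave() */

	raw_spin_lock_irqsave(&cfs_b->lock, flags);
	/* fold leftover runtime back and clear distribute_running */
	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}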
*/ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; @@ -6097,7 +6142,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6202,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); + i = select_idle_smt(p, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6608,7 +6653,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -7027,6 +7072,12 @@ idle: if (new_tasks > 0) goto again; + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); + return NULL; } @@ -7647,10 +7698,27 @@ static inline bool others_have_blocked(struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7662,14 +7730,10 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -7677,14 +7741,21 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, 0); + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. 
+ */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); + /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) done = false; } curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7754,11 +7825,11 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -8452,9 +8523,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_CAPACITY_SCALE); + env->imbalance = sds->busiest_stat.group_load; return 1; } @@ -8636,7 +8705,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) @@ -8827,21 +8896,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env) { - struct sched_domain *sd = env->sd; + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu); +} - if (env->idle == CPU_NEWLY_IDLE) { +static inline bool +voluntary_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu)) - return 1; - } + if (asym_active_balance(env)) + return 1; /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 
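The check_asym_packing() hunk above is an algebraic simplification rather than a behavioural change. Assuming avg_load is computed the usual way in update_sg_lb_stats(), i.e. group_load * SCHED_CAPACITY_SCALE / group_capacity, the old expression reduces to group_load while performing two integer divisions that can only lose precision:

/*
 * old:  imbalance = DIV_ROUND_CLOSEST(avg_load * group_capacity, 1024)
 *                 = DIV_ROUND_CLOSEST(((group_load * 1024) / group_capacity)
 *                                     * group_capacity, 1024)
 *                ~= group_load          (modulo two rounding steps)
 *
 * new:  imbalance = group_load          (exact, no rounding)
 */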
@@ -8859,6 +8932,16 @@ static int need_active_balance(struct lb_env *env) if (env->src_grp_type == group_misfit_task) return 1; + return 0; +} + +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + if (voluntary_active_balance(env)) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9023,7 +9106,7 @@ more_balance: if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's CPUs */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + __cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; @@ -9050,7 +9133,7 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); + __cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are @@ -9120,7 +9203,7 @@ more_balance: } else sd->nr_balance_failed = 0; - if (likely(!active_balance)) { + if (likely(!active_balance) || voluntary_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { @@ -9469,15 +9552,8 @@ static void kick_ilb(unsigned int flags) } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system. */ static void nohz_balancer_kick(struct rq *rq) { @@ -9519,8 +9595,13 @@ static void nohz_balancer_kick(struct rq *rq) sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); if (sds) { /* - * XXX: write a coherent comment on why we do this. - * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. 
*/ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { @@ -9533,7 +9614,7 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9541,11 +9622,7 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10546,10 +10623,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81faddba9e20..b02d148e7672 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); } else { cpumask_var_t tmp; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 90fb5bc12ad4..befce29bd882 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -26,7 +26,6 @@ #include <linux/sched.h> #include "sched.h" -#include "sched-pelt.h" #include "pelt.h" /* @@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * n=1 */ static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, +accumulate_sum(u64 delta, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { - unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - delta += sa->period_contrib; periods = delta / 1024; /* A period is 1024us (~1ms) */ @@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, } sa->period_contrib = delta; - contrib = cap_scale(contrib, scale_freq); if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) - sa->util_sum += contrib * scale_cpu; + sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; return periods; } @@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, +___update_load_sum(u64 now, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. 
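The accumulate_sum() hunk just above is the heart of the clock_pelt rework: the per-call frequency/capacity scaling disappears because the time deltas fed into the PELT machinery are already scaled when rq->clock_pelt advances (see the pelt.h additions further down). To first order, ignoring period-boundary rounding, the accounted utilization is unchanged; a rough side-by-side with scale_freq and scale_cpu in the usual 0..1024 range (illustrative helper names, not kernel code):

static inline u64 util_contrib_old(u64 delta, unsigned long scale_freq,
				   unsigned long scale_cpu)
{
	u64 contrib = (delta * scale_freq) >> 10;	/* cap_scale(freq) */

	return contrib * scale_cpu;			/* added to util_sum */
}

static inline u64 util_contrib_new(u64 delta_pelt)
{
	/* delta_pelt is already delta * scale_freq * scale_cpu >> 20 */
	return delta_pelt << 10;			/* SCHED_CAPACITY_SHIFT */
}

Both come out to roughly delta * scale_freq * scale_cpu >> 10 for a fully running entity.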
*/ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + if (!accumulate_sum(delta, sa, load, runnable, running)) return 0; return 1; @@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * runnable_load_avg = \Sum se->avg.runable_load_avg */ -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) return 0; } -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); @@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e return 0; } -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { - if (___update_load_sum(now, cpu, &cfs_rq->avg, + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), scale_load_down(cfs_rq->runnable_weight), cfs_rq->curr != NULL)) { @@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + if (___update_load_sum(now, &rq->avg_rt, running, running, running)) { @@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + if (___update_load_sum(now, &rq->avg_dl, running, running, running)) { @@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) int update_irq_load_avg(struct rq *rq, u64 running) { int ret = 0; + + /* + * We can't use clock_pelt because irq time is not accounted in + * clock_task. Instead we directly scale the running time to + * reflect the real amount of computation + */ + running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + /* * We know the time that has been used by interrupt since last update * but we don't when. Let be pessimistic and assume that interrupt has * happened just before the update. This is not so far from reality * because interrupt will most probably wake up task and trig an update - * of rq clock during which the metric si updated. + * of rq clock during which the metric is updated. * We start to decay with normal context time and then we add the * interrupt context time. 
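Because irq time is accounted in rq->clock but never in clock_task (and therefore never in clock_pelt), update_irq_load_avg() above has to apply the frequency and CPU-capacity scaling by hand before feeding the delta to ___update_load_sum(). A self-contained toy version of those two cap_scale() steps, assuming the standard 1024-based capacity scale:

#define TOY_CAPACITY_SHIFT	10	/* SCHED_CAPACITY_SHIFT */
#define toy_cap_scale(v, s)	(((v) * (s)) >> TOY_CAPACITY_SHIFT)

/*
 * 1000us of irq time on a CPU currently at half frequency (512) and half
 * microarchitectural capacity (512) counts as 250us of max-capacity work:
 * 1000 -> 500 -> 250.
 */
static unsigned long long toy_scale_irq_time(unsigned long long running)
{
	running = toy_cap_scale(running, 512);	/* arch_scale_freq_capacity() */
	running = toy_cap_scale(running, 512);	/* arch_scale_cpu_capacity()  */
	return running;
}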
* We can safely remove running from rq->clock because * rq->clock += delta with delta >= running */ - ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0, 0, 0); - ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1, 1, 1); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7e56b489ff32..7489d5f56960 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,8 +1,9 @@ #ifdef CONFIG_SMP +#include "sched-pelt.h" -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +/* + * The clock_pelt scales the time to reflect the effective amount of + * computation done during the running delta time but then sync back to + * clock_task when rq is idle. + * + * + * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 + * @ max capacity ------******---------------******--------------- + * @ half capacity ------************---------************--------- + * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 + * + */ +static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) +{ + if (unlikely(is_idle_task(rq->curr))) { + /* The rq is idle, we can sync to clock_task */ + rq->clock_pelt = rq_clock_task(rq); + return; + } + + /* + * When a rq runs at a lower compute capacity, it will need + * more time to do the same amount of work than at max + * capacity. In order to be invariant, we scale the delta to + * reflect how much work has been really done. + * Running longer results in stealing idle time that will + * disturb the load signal compared to max capacity. This + * stolen idle time will be automatically reflected when the + * rq will be idle and the clock will be synced with + * rq_clock_task. + */ + + /* + * Scale the elapsed time to reflect the real amount of + * computation + */ + delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); + + rq->clock_pelt += delta; +} + +/* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum + * is greater or equal to: + * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; + * For optimization and computing rounding purpose, we don't take into account + * the position in the current window (period_contrib) and we use the higher + * bound of util_sum to decide. + */ +static inline void update_idle_rq_clock_pelt(struct rq *rq) +{ + u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; + u32 util_sum = rq->cfs.avg.util_sum; + util_sum += rq->avg_rt.util_sum; + util_sum += rq->avg_dl.util_sum; + + /* + * Reflecting stolen time makes sense only if the idle + * phase would be present at max capacity. 
As soon as the + * utilization of a rq has reached the maximum value, it is + * considered as an always runnig rq without idle time to + * steal. This potential idle time is considered as lost in + * this case. We keep track of this lost idle time compare to + * rq's clock_task. + */ + if (util_sum >= divider) + rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +#ifdef CONFIG_CFS_BANDWIDTH +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} +#else +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + return rq_clock_pelt(rq_of(cfs_rq)); +} +#endif + #else static inline int @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running) { return 0; } + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + return rq_clock_task(rq); +} + +static inline void +update_rq_clock_pelt(struct rq *rq, s64 delta) { } + +static inline void +update_idle_rq_clock_pelt(struct rq *rq) { } + #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e4f398ad9e73..90fa23d36565 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * rt task */ if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * The previous task needs to be made eligible for pushing @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d04530bf251f..efa686eeff26 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -861,7 +861,10 @@ struct rq { unsigned int clock_update_flags; u64 clock; - u64 clock_task; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; atomic_t nr_iowait; @@ -951,6 +954,22 @@ struct rq { #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -1260,7 +1279,7 @@ extern void sched_ttwu_pending(void); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. + * See destroy_sched_domains: call_rcu for details. * * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. 
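For a sense of scale of the "fully busy" test in update_idle_rq_clock_pelt() above, assuming the usual LOAD_AVG_MAX value of 47742 produced by the sched-pelt.h generator (the exact constant depends on the generator's parameters):

/*
 * divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX
 *         = ((47742 - 1024) << 10) - 47742
 *         =  47839232 - 47742
 *         =  47791490
 *
 * Only when the summed cfs + rt + dl util_sum reaches that value is the rq
 * treated as having had no idle time, in which case the clock_pelt deficit
 * is folded into lost_idle_time instead of being given back at idle.
 */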
@@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); #ifdef CONFIG_THREAD_INFO_IN_TASK - p->cpu = cpu; + WRITE_ONCE(p->cpu, cpu); #else - task_thread_info(p)->cpu = cpu; + WRITE_ONCE(task_thread_info(p)->cpu, cpu); #endif p->wake_cpu = cpu; #endif @@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p) static inline int task_on_rq_migrating(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_MIGRATING; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } /* @@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void post_init_entity_util_avg(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} # define arch_scale_freq_invariant() false #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL /** * enum schedutil_type - CPU utilization type @@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -#else + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + #define perf_domain_span(pd) NULL -#endif +static inline bool sched_energy_enabled(void) { return false; } -#ifdef CONFIG_SMP -extern struct static_key_false sched_energy_present; -#endif +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3f35ba1d8fde..ab7f371a3a17 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. 
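The WRITE_ONCE()/READ_ONCE() conversions at the top of this sched.h hunk follow the usual pairing rule for fields read locklessly from other CPUs (p->cpu, p->on_rq): the writer uses WRITE_ONCE() and the lockless reader uses READ_ONCE(), so the compiler can neither tear the access nor reload it. A generic sketch of the idiom, not the scheduler code itself:

struct toy_task {
	int	cpu;
};

static void toy_set_cpu(struct toy_task *p, int cpu)
{
	WRITE_ONCE(p->cpu, cpu);	/* writer, under the relevant lock */
}

static int toy_task_cpu(const struct toy_task *p)
{
	return READ_ONCE(p->cpu);	/* lockless reader: no tearing, no reload */
}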
*/ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { @@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); + call_rcu(&old_rd->rcu, free_rootdomain); } void sched_get_rd(struct root_domain *rd) @@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd) if (!atomic_dec_and_test(&rd->refcount)) return; - call_rcu_sched(&rd->rcu, free_rootdomain); + call_rcu(&rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index a43c601ac252..54a0347ca812 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -445,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable_noaudit(current_cred(), current_user_ns(), - CAP_SYS_ADMIN) != 0) + security_capable(current_cred(), current_user_ns(), + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ diff --git a/kernel/signal.c b/kernel/signal.c index 57b7771e20d7..5d53183e2705 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3455,7 +3455,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { diff --git a/kernel/softirq.c b/kernel/softirq.c index d28813306b2c..10277429ed84 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending) if (pending & SOFTIRQ_NOW_MASK) return false; - return tsk && (tsk->state == TASK_RUNNING); + return tsk && (tsk->state == TASK_RUNNING) && + !__kthread_should_park(tsk); } /* diff --git a/kernel/sys.c b/kernel/sys.c index f7eb62eceb24..12df0e5434b8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETUID)) { + if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETUID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid) if 
(uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - ns_capable(old->user_ns, CAP_SETUID)) { + ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; + /* fall through */ case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc934f31ab10..51d7c6794bf1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy); COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ @@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4); /* fs/timerfd.c */ COND_SYSCALL(timerfd_create); COND_SYSCALL(timerfd_settime); -COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_settime32); COND_SYSCALL(timerfd_gettime); -COND_SYSCALL_COMPAT(timerfd_gettime); +COND_SYSCALL(timerfd_gettime32); /* fs/utimes.c */ @@ -135,7 +140,7 @@ COND_SYSCALL(capset); /* kernel/futex.c */ COND_SYSCALL(futex); -COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); @@ -187,9 +192,9 @@ COND_SYSCALL(mq_open); COND_SYSCALL_COMPAT(mq_open); COND_SYSCALL(mq_unlink); COND_SYSCALL(mq_timedsend); -COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedsend_time32); COND_SYSCALL(mq_timedreceive); -COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_timedreceive_time32); COND_SYSCALL(mq_notify); COND_SYSCALL_COMPAT(mq_notify); COND_SYSCALL(mq_getsetattr); @@ -197,8 +202,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr); /* ipc/msg.c */ COND_SYSCALL(msgget); +COND_SYSCALL(old_msgctl); COND_SYSCALL(msgctl); COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL_COMPAT(old_msgctl); COND_SYSCALL(msgrcv); COND_SYSCALL_COMPAT(msgrcv); COND_SYSCALL(msgsnd); @@ -206,16 +213,20 @@ COND_SYSCALL_COMPAT(msgsnd); /* ipc/sem.c */ COND_SYSCALL(semget); +COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL_COMPAT(old_semctl); COND_SYSCALL(semtimedop); -COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); /* ipc/shm.c */ COND_SYSCALL(shmget); +COND_SYSCALL(old_shmctl); COND_SYSCALL(shmctl); COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL_COMPAT(old_shmctl); COND_SYSCALL(shmat); COND_SYSCALL_COMPAT(shmat); COND_SYSCALL(shmdt); @@ -285,7 +296,7 @@ COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); COND_SYSCALL(recvmmsg); COND_SYSCALL(recvmmsg_time32); -COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2322b12cb4cf..7bb3988425ee 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,8 @@ #include <linux/bpf.h> #include <linux/mount.h> +#include "../lib/kstrtox.h" + #include <linux/uaccess.h> #include <asm/processor.h> @@ -127,6 +129,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int 
__maybe_unused four = 4; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -472,6 +475,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1460,7 +1474,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, @@ -1736,6 +1750,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2106,6 +2122,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v) } } +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. + */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer @@ -2149,7 +2200,8 @@ static int proc_get_long(char **buf, size_t *size, if (!isdigit(*p)) return -EINVAL; - *val = simple_strtoul(p, &p, 0); + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; len = p - tmp; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 58b981f4bb5d..e2c038d6c13c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -117,6 +117,35 @@ config NO_HZ_FULL endchoice +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. 
This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f5cfa1b73d6f..41dfff23c1f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -364,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36a2bef00125..92a90014a925 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -188,13 +188,13 @@ static inline int is_error_status(int status) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; if (!(time_status & STA_NANO)) - txc->jitter /= NSEC_PER_USEC; + txc->jitter = pps_jitter / NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; @@ -220,7 +220,7 @@ static inline int is_error_status(int status) return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -633,7 +633,7 @@ void ntp_notify_cmos_timer(void) /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct timex *txc) +static inline void process_adj_status(const struct __kernel_timex *txc) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -656,7 +656,8 @@ static inline void process_adj_status(const struct timex *txc) } -static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) +static inline void process_adjtimex_modes(const struct __kernel_timex *txc, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc); @@ -707,7 +708,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
*/ -int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) +int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai) { int result; @@ -729,7 +731,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; + txc->offset = (u32)txc->offset / NSEC_PER_USEC; } result = time_state; /* mostly `TIME_OK' */ @@ -754,7 +756,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) - txc->time.tv_usec /= NSEC_PER_USEC; + txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index c24b0e13f011..40e6122e634e 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 425bbfce6819..ec960bb939fd 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd) fput(cd->fp); } -static int pc_clock_adjtime(clockid_t id, struct timex *tx) +static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) { struct posix_clock_desc cd; int err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 80f955210861..0a426f4e3125 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) int i; u64 delta, incr; - if (timer->it.cpu.incr == 0) + if (!timer->it_interval) return; if (now < timer->it.cpu.expires) return; - incr = timer->it.cpu.incr; + incr = timer->it_interval; delta = now + incr - timer->it.cpu.expires; /* Don't use (incr*2 < delta), incr*2 might overflow. */ @@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ wake_up_process(timer->it_process); timer->it.cpu.expires = 0; - } else if (timer->it.cpu.incr == 0) { + } else if (!timer->it_interval) { /* * One-shot timer. Clear it as soon as it's fired. */ @@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ ret = 0; - old_incr = timer->it.cpu.incr; + old_incr = timer->it_interval; old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. 
*/ - timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); - timer->it_interval = ns_to_ktime(timer->it.cpu.incr); + timer->it_interval = timespec64_to_ktime(new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); + itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!timer->it.cpu.expires) return; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a51895486e5e..67df65f887ac 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -45,6 +45,7 @@ SYS_NI(timer_delete); SYS_NI(clock_adjtime); SYS_NI(getitimer); SYS_NI(setitimer); +SYS_NI(clock_adjtime32); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif @@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYS_NI(timer_settime32); +SYS_NI(timer_gettime32); + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; @@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } } -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0e84bb72a3da..29176635991f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -179,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock, } static int posix_clock_realtime_adj(const clockid_t which_clock, - struct timex *t) + struct __kernel_timex *t) { return do_adjtimex(t); } @@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct old_itimerspec32 __user *, setting) +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; @@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, 
timer_id, int, flags, - struct old_itimerspec32 __user *, new, - struct old_itimerspec32 __user *, old) +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -1047,22 +1047,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, - struct timex __user *, utx) +int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; - int err; if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; + return kc->clock_adj(which_clock, ktx); +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct __kernel_timex __user *, utx) +{ + struct __kernel_timex ktx; + int err; + if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; @@ -1090,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1105,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, return kc->clock_set(which_clock, &ts); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,40 +1129,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -#endif - -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) +SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, + struct old_timex32 __user *, utp) { - const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; + struct __kernel_timex ktx; int err; - if (!kc) - return -EINVAL; - if (!kc->clock_adj) - return -EOPNOTSUPP; - - err = compat_get_timex(&ktx, utp); + err = get_old_timex32(&ktx, utp); if (err) return err; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0) - err = compat_put_timex(utp, &ktx); + err = put_old_timex32(utp, &ktx); return err; } -#endif - -#ifdef CONFIG_COMPAT_32BIT_TIME - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1212,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, 
rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ddb21145211a..de5daa6d975a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -8,7 +8,7 @@ struct k_clock { const struct timespec64 *tp); int (*clock_get)(const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, const struct timespec64 *); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 803fa67aace9..ee834d4fb814 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -375,6 +375,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; + /* fall through */ case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 2edb5088a70b..c3f756f8534b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ -#ifdef CONFIG_COMPAT -#ifdef __ARCH_WANT_COMPAT_SYS_TIME +#ifdef CONFIG_COMPAT_32BIT_TIME +#ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; @@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) return 0; } -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, @@ -263,35 +263,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { - struct timex txc; /* Local copy of parameter */ + struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ - if (copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; + return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? 
-EFAULT : ret; } +#endif -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME +int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) +{ + struct old_timex32 tx32; + + memset(txc, 0, sizeof(struct __kernel_timex)); + if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) + return -EFAULT; + + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + + return 0; +} + +int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) +{ + struct old_timex32 tx32; + + memset(&tx32, 0, sizeof(struct old_timex32)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) + return -EFAULT; + return 0; +} -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { - struct timex txc; + struct __kernel_timex txc; int err, ret; - err = compat_get_timex(&txc, utp); + err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); - err = compat_put_timex(utp, &txc); + err = put_old_timex32(utp, &txc); if (err) return err; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ac5dbf2cd4a2..f986e1918d12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2234,7 +2234,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /** * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2300,7 +2300,7 @@ static int timekeeping_validate_timex(const struct timex *txc) /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ -int do_adjtimex(struct timex *txc) +int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 86489950d690..b73e8850e58d 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); static int __init tk_debug_sleep_time_init(void) { - struct dentry *d; - - d = debugfs_create_file("sleep_time", 0444, 
NULL, NULL, - &tk_debug_sleep_time_fops); - if (!d) { - pr_err("Failed to create sleep_time debug file\n"); - return -ENOMEM; - } - + debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); return 0; } late_initcall(tk_debug_sleep_time_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 444156debfa0..2fce056f8a49 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -647,7 +647,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -1632,7 +1632,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(user_tick); + rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); diff --git a/kernel/torture.c b/kernel/torture.c index bbf6d473e50c..8faa1a9aaeb9 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Common functions for in-kernel torture tests. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * Based on kernel/rcu/torture.c. */ @@ -53,7 +40,7 @@ #include "rcu/rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); static char *torture_type; static int verbose; @@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex); static struct task_struct *onoff_task; static long onoff_holdoff; static long onoff_interval; +static torture_ofl_func *onoff_f; static long n_offline_attempts; static long n_offline_successes; static unsigned long sum_offline; @@ -118,6 +106,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); + if (onoff_f) + onoff_f(); (*n_offl_successes)++; delta = jiffies - starttime; *sum_offl += delta; @@ -243,11 +233,12 @@ stop: /* * Initiate online-offline handling. 
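The timekeeping_debug hunk above drops the error handling around debugfs_create_file(), following the convention that a debugfs failure is not fatal. A minimal sketch of that pattern with a hypothetical file name and show routine (DEFINE_SHOW_ATTRIBUTE() and debugfs_create_file() are the real APIs):

        #include <linux/debugfs.h>
        #include <linux/seq_file.h>
        #include <linux/init.h>

        static int my_stats_show(struct seq_file *s, void *unused)
        {
                seq_printf(s, "events: %lu\n", 0UL);
                return 0;
        }
        DEFINE_SHOW_ATTRIBUTE(my_stats);        /* generates my_stats_fops */

        static int __init my_stats_debugfs_init(void)
        {
                /* Return value deliberately ignored; a missing debugfs file
                 * must not prevent the rest of init from succeeding. */
                debugfs_create_file("my_stats", 0444, NULL, NULL, &my_stats_fops);
                return 0;
        }
        late_initcall(my_stats_debugfs_init);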
*/ -int torture_onoff_init(long ooholdoff, long oointerval) +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f) { #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; + onoff_f = f; if (onoff_interval <= 0) return 0; return torture_create_kthread(torture_onoff, NULL, onoff_task); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fa8b1fe824f3..8bd1d6d001d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -370,6 +370,7 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1a86a0d881d..d64c00afceb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -431,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_event_output(event, sd, regs); - return 0; + return perf_event_output(event, sd, regs); } BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..217ef481fbbb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1301,7 +1301,7 @@ static int parse_pred(const char *str, void *data, /* go past the last quote */ i++; - } else if (isdigit(str[i])) { + } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { @@ -1314,6 +1314,9 @@ static int parse_pred(const char *str, void *data, goto err_free; } + if (str[i] == '-') + i++; + /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..d42a473b8240 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" @@ -365,7 +366,7 @@ out: __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } -static inline void +static nokprobe_inline void start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_dec(&data->disabled); } -static inline void +static nokprobe_inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { @@ -452,6 +454,7 @@ void stop_critical_timings(void) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { @@ -619,6 +623,7 @@ void 
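The torture_onoff_init() change above lets a torture test hook each successful CPU offline via the new torture_ofl_func argument. A hedged caller-side sketch; the my_* names are hypothetical and the holdoff/interval values are assumed to be in jiffies:

        static void my_post_offline(void)
        {
                /* Invoked from torture_offline() after a CPU is taken down. */
                pr_info("my_torture: CPU offlined, re-checking invariants\n");
        }

        static int __init my_torture_init(void)
        {
                /* Assumed jiffies: hold off ~30s, then offline/online every ~3s. */
                return torture_onoff_init(30 * HZ, 3 * HZ, my_post_offline);
        }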
tracer_hardirqs_off(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_off); static int irqsoff_tracer_init(struct trace_array *tr) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9eaf07f99212..99592c27465e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -865,7 +865,7 @@ fetch_store_strlen(unsigned long addr) u8 c; do { - ret = probe_mem_read(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 977918d5d350..8fbfda94a67b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); +static int __init watchdog_thresh_setup(char *str) +{ + get_option(&str, &watchdog_thresh); + return 1; +} +__setup("watchdog_thresh=", watchdog_thresh_setup); + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fc5d23d752a5..4026d1871407 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -259,6 +259,8 @@ struct workqueue_struct { struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP + char *lock_name; + struct lock_class_key key; struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -646,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible - * reordering can lead to a missed execution on attempt to qeueue + * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. 
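The new __setup() handler in kernel/watchdog.c makes the lockup detector threshold settable at boot as well as through the existing sysctl; for example, booting with

        watchdog_thresh=20

would widen the hard lockup window to about 20 seconds, with the soft lockup threshold derived from it (2 * watchdog_thresh). The value shown is only illustrative.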
consider this case: * * CPU#0 CPU#1 @@ -918,6 +920,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * CONTEXT: * spin_lock_irq(rq->lock) * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. + * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. @@ -1341,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq) worker = current_wq_worker(); /* - * Return %true iff I'm a worker execuing a work item on @wq. If + * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; @@ -1512,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); +/** + * workqueue_select_cpu_near - Select a CPU based on NUMA node + * @node: NUMA node ID that we want to select a CPU from + * + * This function will attempt to find a "random" cpu available on a given + * node. If there are no CPUs available on the given node it will return + * WORK_CPU_UNBOUND indicating that we should just schedule to any + * available CPU if we need to schedule this work. + */ +static int workqueue_select_cpu_near(int node) +{ + int cpu; + + /* No point in doing this if NUMA isn't enabled for workqueues */ + if (!wq_numa_enabled) + return WORK_CPU_UNBOUND; + + /* Delay binding to CPU if node is not valid or online */ + if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) + return WORK_CPU_UNBOUND; + + /* Use local node/cpu if we are already there */ + cpu = raw_smp_processor_id(); + if (node == cpu_to_node(cpu)) + return cpu; + + /* Use "random" otherwise know as "first" online CPU of node */ + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + + /* If CPU is valid return that, otherwise just defer */ + return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; +} + +/** + * queue_work_node - queue work on a "random" cpu for a given NUMA node + * @node: NUMA node that we are targeting the work for + * @wq: workqueue to use + * @work: work to queue + * + * We queue the work to a "random" CPU within a given NUMA node. The basic + * idea here is to provide a way to somehow associate work with a given + * NUMA node. + * + * This function will only make a best effort attempt at getting this onto + * the right NUMA node. If no node is requested or the requested node is + * offline then we just fall back to standard queue_work behavior. + * + * Currently the "random" CPU ends up being the first available CPU in the + * intersection of cpu_online_mask and the cpumask of the node, unless we + * are running on the node. In that case we just use the current CPU. + * + * Return: %false if @work was already on a queue, %true otherwise. + */ +bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work) +{ + unsigned long flags; + bool ret = false; + + /* + * This current implementation is specific to unbound workqueues. 
+ * Specifically we only return the first available CPU for a given + * node instead of cycling through individual CPUs within the node. + * + * If this is used with a per-cpu workqueue then the logic in + * workqueue_select_cpu_near would need to be updated to allow for + * some round robin type logic. + */ + WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); + + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + int cpu = workqueue_select_cpu_near(node); + + __queue_work(cpu, wq, work); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_node); + void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); @@ -1639,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu) * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. - * While @rwork is guarnateed to be executed after a %false return, the + * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) @@ -2931,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!wq_online)) return false; + if (WARN_ON(!work->func)) + return false; + if (!from_cancel) { lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -3337,11 +3436,51 @@ static int init_worker_pool(struct worker_pool *pool) return 0; } +#ifdef CONFIG_LOCKDEP +static void wq_init_lockdep(struct workqueue_struct *wq) +{ + char *lock_name; + + lockdep_register_key(&wq->key); + lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); + if (!lock_name) + lock_name = wq->name; + + wq->lock_name = lock_name; + lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ + lockdep_unregister_key(&wq->key); +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ + if (wq->lock_name != wq->name) + kfree(wq->lock_name); +} +#else +static void wq_init_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ +} +#endif + static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); + wq_free_lockdep(wq); + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_pwqs); else @@ -3532,8 +3671,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ - if (is_last) + if (is_last) { + wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); + } } /** @@ -4067,11 +4208,9 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } -struct workqueue_struct *__alloc_workqueue_key(const char *fmt, - unsigned int flags, - int max_active, - struct lock_class_key *key, - const char *lock_name, ...) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) 
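queue_work_node(), completed above, gives drivers a way to keep deferred work close to a device's memory. A hedged driver-side sketch; the my_* names are hypothetical, while dev_to_node() and system_unbound_wq are existing kernel interfaces, the latter satisfying the WQ_UNBOUND check in the implementation:

        #include <linux/workqueue.h>
        #include <linux/pci.h>

        static void my_late_init(struct work_struct *work)
        {
                /* Heavy setup that prefers node-local allocations. */
        }
        static DECLARE_WORK(my_late_init_work, my_late_init);

        static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        {
                /*
                 * Best effort only: if the device's node has no online CPUs,
                 * workqueue_select_cpu_near() falls back to WORK_CPU_UNBOUND.
                 */
                queue_work_node(dev_to_node(&pdev->dev), system_unbound_wq,
                                &my_late_init_work);
                return 0;
        }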
{ size_t tbl_size = 0; va_list args; @@ -4106,7 +4245,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_free_wq; } - va_start(args, lock_name); + va_start(args, max_active); vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); @@ -4123,7 +4262,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) @@ -4154,6 +4293,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; err_free_wq: + wq_unregister_lockdep(wq); + wq_free_lockdep(wq); free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; @@ -4161,7 +4302,7 @@ err_destroy: destroy_workqueue(wq); return NULL; } -EXPORT_SYMBOL_GPL(__alloc_workqueue_key); +EXPORT_SYMBOL_GPL(alloc_workqueue); /** * destroy_workqueue - safely terminate a workqueue @@ -4214,6 +4355,7 @@ void destroy_workqueue(struct workqueue_struct *wq) kthread_stop(wq->rescuer->task); if (!(wq->flags & WQ_UNBOUND)) { + wq_unregister_lockdep(wq); /* * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. |
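Since the lockdep key and lock name are now set up internally by wq_init_lockdep(), callers invoke alloc_workqueue() directly with printf-style name arguments instead of the removed __alloc_workqueue_key() parameters. An illustrative call site (names and flags are hypothetical):

        static struct workqueue_struct *mydrv_wq;

        static int mydrv_create_wq(int id)
        {
                mydrv_wq = alloc_workqueue("mydrv_wq%d", WQ_MEM_RECLAIM, 0, id);
                if (!mydrv_wq)
                        return -ENOMEM;
                return 0;
        }

        static void mydrv_destroy_wq(void)
        {
                /* Releases the workqueue; the lockdep key and generated
                 * lock name are torn down internally. */
                destroy_workqueue(mydrv_wq);
        }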