diff options
Diffstat (limited to 'kernel')
93 files changed, 2750 insertions, 1607 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 7343b3a9bff0..9dc7f519129d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,9 +76,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o -obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/kernel/audit.c b/kernel/audit.c index 2a8058764aa6..632d36059556 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@ #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> -#include <linux/slab.h> #include <linux/audit.h> @@ -400,7 +399,7 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u", function_name, new, old); + audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1067,7 +1066,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; - audit_log_format(*ab, "pid=%d uid=%u", pid, uid); + audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } @@ -1096,10 +1095,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature if (audit_enabled == AUDIT_OFF) return; + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; - audit_log_task_info(ab, current); + audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); @@ -2042,7 +2042,7 @@ void audit_log_session_info(struct audit_buffer *ab) unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) @@ -2058,11 +2058,13 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { int i; - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", - cap->cap[CAP_LAST_U32 - i]); + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) @@ -2177,22 +2179,21 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } /* log the audit_names record type */ - audit_log_format(ab, " nametype="); switch(n->type) { case AUDIT_TYPE_NORMAL: - audit_log_format(ab, "NORMAL"); + audit_log_format(ab, " nametype=NORMAL"); break; case AUDIT_TYPE_PARENT: - audit_log_format(ab, "PARENT"); + audit_log_format(ab, " nametype=PARENT"); break; case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, "DELETE"); + audit_log_format(ab, " nametype=DELETE"); break; case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, "CREATE"); + audit_log_format(ab, " nametype=CREATE"); break; default: - audit_log_format(ab, "UNKNOWN"); + audit_log_format(ab, " nametype=UNKNOWN"); break; } @@ -2247,15 +2248,15 @@ out_null: audit_log_format(ab, " exe=(null)"); } -struct tty_struct *audit_get_tty(struct task_struct *tsk) +struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; - spin_lock_irqsave(&tsk->sighand->siglock, flags); - if (tsk->signal) - tty = tty_kref_get(tsk->signal->tty); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); + if (current->signal) + tty = tty_kref_get(current->signal->tty); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); return tty; } @@ -2264,25 +2265,24 @@ void audit_put_tty(struct tty_struct *tty) tty_kref_put(tty); } -void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; - char comm[sizeof(tsk->comm)]; + char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; - /* tsk == current */ cred = current_cred(); - tty = audit_get_tty(tsk); + tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - task_ppid_nr(tsk), - task_tgid_nr(tsk), - from_kuid(&init_user_ns, audit_get_loginuid(tsk)), + task_ppid_nr(current), + task_tgid_nr(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), @@ -2292,11 +2292,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", - audit_get_sessionid(tsk)); + audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); @@ -2317,7 +2317,7 @@ void audit_log_link_denied(const char *operation) if (!ab) return; audit_log_format(ab, "op=%s", operation); - audit_log_task_info(ab, current); + audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 214e14948370..91421679a168 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -210,6 +210,8 @@ struct audit_context { extern bool audit_ever_enabled; +extern void audit_log_session_info(struct audit_buffer *ab); + extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, struct inode *inode); @@ -262,11 +264,11 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); -extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); /* audit watch functions */ -#ifdef CONFIG_AUDIT_WATCH +#ifdef CONFIG_AUDITSYSCALL extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); @@ -299,9 +301,9 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDIT_WATCH */ +#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDIT_TREE +#ifdef CONFIG_AUDITSYSCALL extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); extern void audit_put_chunk(struct audit_chunk *chunk); extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index fba78047fb37..cf4512a33675 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -130,10 +130,8 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); - audit_log_format(ab, " path="); + audit_log_session_info(ab); + audit_log_format(ab, " op=%s path=", op); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ea43181cde4a..d4af4d97f847 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,9 +24,9 @@ struct audit_tree { struct audit_chunk { struct list_head hash; - struct fsnotify_mark mark; + unsigned long key; + struct fsnotify_mark *mark; struct list_head trees; /* with root here */ - int dead; int count; atomic_long_t refs; struct rcu_head head; @@ -37,13 +37,25 @@ struct audit_chunk { } owners[]; }; +struct audit_tree_mark { + struct fsnotify_mark mark; + struct audit_chunk *chunk; +}; + static LIST_HEAD(tree_list); static LIST_HEAD(prune_list); static struct task_struct *prune_thread; /* - * One struct chunk is attached to each inode of interest. - * We replace struct chunk on tagging/untagging. + * One struct chunk is attached to each inode of interest through + * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging / + * untagging, the mark is stable as long as there is chunk attached. The + * association between mark and chunk is protected by hash_lock and + * audit_tree_group->mark_mutex. Thus as long as we hold + * audit_tree_group->mark_mutex and check that the mark is alive by + * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to + * the current chunk. + * * Rules have pointer to struct audit_tree. * Rules have struct list_head rlist forming a list of rules over * the same tree. @@ -62,8 +74,12 @@ static struct task_struct *prune_thread; * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount - * of watch contributes 1 to .refs). + * chunk is refcounted by embedded .refs. Mark associated with the chunk holds + * one chunk reference. This reference is dropped either when a mark is going + * to be freed (corresponding inode goes away) or when chunk attached to the + * mark gets replaced. This reference must be dropped using + * audit_mark_put_chunk() to make sure the reference is dropped only after RCU + * grace period as it protects RCU readers of the hash table. * * node.index allows to get from node.list to containing chunk. * MSB of that sucker is stolen to mark taggings that we might have to @@ -72,6 +88,7 @@ static struct task_struct *prune_thread; */ static struct fsnotify_group *audit_tree_group; +static struct kmem_cache *audit_tree_mark_cachep __read_mostly; static struct audit_tree *alloc_tree(const char *s) { @@ -131,12 +148,43 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } -static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +/* + * Drop reference to the chunk that was held by the mark. This is the reference + * that gets dropped after we've removed the chunk from the hash table and we + * use it to make sure chunk cannot be freed before RCU grace period expires. + */ +static void audit_mark_put_chunk(struct audit_chunk *chunk) { - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); call_rcu(&chunk->head, __put_chunk); } +static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark) +{ + return container_of(mark, struct audit_tree_mark, mark); +} + +static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark) +{ + return audit_mark(mark)->chunk; +} + +static void audit_tree_destroy_watch(struct fsnotify_mark *mark) +{ + kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark)); +} + +static struct fsnotify_mark *alloc_mark(void) +{ + struct audit_tree_mark *amark; + + amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL); + if (!amark) + return NULL; + fsnotify_init_mark(&amark->mark, audit_tree_group); + amark->mark.mask = FS_IN_IGNORED; + return &amark->mark; +} + static struct audit_chunk *alloc_chunk(int count) { struct audit_chunk *chunk; @@ -156,8 +204,6 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_group); - chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -172,36 +218,25 @@ static unsigned long inode_to_key(const struct inode *inode) return (unsigned long)&inode->i_fsnotify_marks; } -/* - * Function to return search key in our hash from chunk. Key 0 is special and - * should never be present in the hash. - */ -static unsigned long chunk_to_key(struct audit_chunk *chunk) -{ - /* - * We have a reference to the mark so it should be attached to a - * connector. - */ - if (WARN_ON_ONCE(!chunk->mark.connector)) - return 0; - return (unsigned long)chunk->mark.connector->obj; -} - static inline struct list_head *chunk_hash(unsigned long key) { unsigned long n = key / L1_CACHE_BYTES; return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock & entry->lock is held by caller */ +/* hash_lock & mark->group->mark_mutex is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - unsigned long key = chunk_to_key(chunk); struct list_head *list; - if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) - return; - list = chunk_hash(key); + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); + WARN_ON_ONCE(!chunk->key); + list = chunk_hash(chunk->key); list_add_rcu(&chunk->hash, list); } @@ -213,7 +248,11 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (chunk_to_key(p) == key) { + /* + * We use a data dependency barrier in READ_ONCE() to make sure + * the chunk we see is fully initialized. + */ + if (READ_ONCE(p->key) == key) { atomic_long_inc(&p->refs); return p; } @@ -239,137 +278,159 @@ static struct audit_chunk *find_chunk(struct node *p) return container_of(p, struct audit_chunk, owners[0]); } -static void untag_chunk(struct node *p) +static void replace_mark_chunk(struct fsnotify_mark *mark, + struct audit_chunk *chunk) +{ + struct audit_chunk *old; + + assert_spin_locked(&hash_lock); + old = mark_chunk(mark); + audit_mark(mark)->chunk = chunk; + if (chunk) + chunk->mark = mark; + if (old) + old->mark = NULL; +} + +static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old) { - struct audit_chunk *chunk = find_chunk(p); - struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new = NULL; struct audit_tree *owner; - int size = chunk->count - 1; int i, j; - fsnotify_get_mark(entry); + new->key = old->key; + list_splice_init(&old->trees, &new->trees); + list_for_each_entry(owner, &new->trees, same_root) + owner->root = new; + for (i = j = 0; j < old->count; i++, j++) { + if (!old->owners[j].owner) { + i--; + continue; + } + owner = old->owners[j].owner; + new->owners[i].owner = owner; + new->owners[i].index = old->owners[j].index - j + i; + if (!owner) /* result of earlier fallback */ + continue; + get_tree(owner); + list_replace_init(&old->owners[j].list, &new->owners[i].list); + } + replace_mark_chunk(old->mark, new); + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); + list_replace_rcu(&old->hash, &new->hash); +} - spin_unlock(&hash_lock); +static void remove_chunk_node(struct audit_chunk *chunk, struct node *p) +{ + struct audit_tree *owner = p->owner; + + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); +} - if (size) - new = alloc_chunk(size); +static int chunk_count_trees(struct audit_chunk *chunk) +{ + int i; + int ret = 0; - mutex_lock(&entry->group->mark_mutex); - spin_lock(&entry->lock); + for (i = 0; i < chunk->count; i++) + if (chunk->owners[i].owner) + ret++; + return ret; +} + +static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) +{ + struct audit_chunk *new; + int size; + + mutex_lock(&audit_tree_group->mark_mutex); /* - * mark_mutex protects mark from getting detached and thus also from - * mark->connector->obj getting NULL. + * mark_mutex stabilizes chunk attached to the mark so we can check + * whether it didn't change while we've dropped hash_lock. */ - if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - spin_unlock(&entry->lock); - mutex_unlock(&entry->group->mark_mutex); - if (new) - fsnotify_put_mark(&new->mark); - goto out; - } - - owner = p->owner; + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) || + mark_chunk(mark) != chunk) + goto out_mutex; + size = chunk_count_trees(chunk); if (!size) { - chunk->dead = 1; spin_lock(&hash_lock); list_del_init(&chunk->trees); - if (owner->root == chunk) - owner->root = NULL; - list_del_init(&p->list); list_del_rcu(&chunk->hash); + replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - mutex_unlock(&entry->group->mark_mutex); - fsnotify_destroy_mark(entry, audit_tree_group); - goto out; + fsnotify_detach_mark(mark); + mutex_unlock(&audit_tree_group->mark_mutex); + audit_mark_put_chunk(chunk); + fsnotify_free_mark(mark); + return; } + new = alloc_chunk(size); if (!new) - goto Fallback; + goto out_mutex; - if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj, - FSNOTIFY_OBJ_TYPE_INODE, 1)) { - fsnotify_put_mark(&new->mark); - goto Fallback; - } - - chunk->dead = 1; spin_lock(&hash_lock); - list_replace_init(&chunk->trees, &new->trees); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - - for (i = j = 0; j <= size; i++, j++) { - struct audit_tree *s; - if (&chunk->owners[j] == p) { - list_del_init(&p->list); - i--; - continue; - } - s = chunk->owners[j].owner; - new->owners[i].owner = s; - new->owners[i].index = chunk->owners[j].index - j + i; - if (!s) /* result of earlier fallback */ - continue; - get_tree(s); - list_replace_init(&chunk->owners[j].list, &new->owners[i].list); - } - - list_replace_rcu(&chunk->hash, &new->hash); - list_for_each_entry(owner, &new->trees, same_root) - owner->root = new; - spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - mutex_unlock(&entry->group->mark_mutex); - fsnotify_destroy_mark(entry, audit_tree_group); - fsnotify_put_mark(&new->mark); /* drop initial reference */ - goto out; - -Fallback: - // do the best we can - spin_lock(&hash_lock); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); + /* + * This has to go last when updating chunk as once replace_chunk() is + * called, new RCU readers can see the new chunk. + */ + replace_chunk(new, chunk); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - mutex_unlock(&entry->group->mark_mutex); -out: - fsnotify_put_mark(entry); - spin_lock(&hash_lock); + mutex_unlock(&audit_tree_group->mark_mutex); + audit_mark_put_chunk(chunk); + return; + +out_mutex: + mutex_unlock(&audit_tree_group->mark_mutex); } +/* Call with group->mark_mutex held, releases it */ static int create_chunk(struct inode *inode, struct audit_tree *tree) { - struct fsnotify_mark *entry; + struct fsnotify_mark *mark; struct audit_chunk *chunk = alloc_chunk(1); - if (!chunk) + + if (!chunk) { + mutex_unlock(&audit_tree_group->mark_mutex); return -ENOMEM; + } - entry = &chunk->mark; - if (fsnotify_add_inode_mark(entry, inode, 0)) { - fsnotify_put_mark(entry); + mark = alloc_mark(); + if (!mark) { + mutex_unlock(&audit_tree_group->mark_mutex); + kfree(chunk); + return -ENOMEM; + } + + if (fsnotify_add_inode_mark_locked(mark, inode, 0)) { + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_put_mark(mark); + kfree(chunk); return -ENOSPC; } - spin_lock(&entry->lock); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - chunk->dead = 1; - spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry, audit_tree_group); - fsnotify_put_mark(entry); + fsnotify_detach_mark(mark); + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + kfree(chunk); return 0; } + replace_mark_chunk(mark, chunk); chunk->owners[0].index = (1U << 31); chunk->owners[0].owner = tree; get_tree(tree); @@ -378,35 +439,49 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) tree->root = chunk; list_add(&tree->same_root, &chunk->trees); } + chunk->key = inode_to_key(inode); + /* + * Inserting into the hash table has to go last as once we do that RCU + * readers can see the chunk. + */ insert_hash(chunk); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - fsnotify_put_mark(entry); /* drop initial reference */ + mutex_unlock(&audit_tree_group->mark_mutex); + /* + * Drop our initial reference. When mark we point to is getting freed, + * we get notification through ->freeing_mark callback and cleanup + * chunk pointing to this mark. + */ + fsnotify_put_mark(mark); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct fsnotify_mark *old_entry, *chunk_entry; - struct audit_tree *owner; + struct fsnotify_mark *mark; struct audit_chunk *chunk, *old; struct node *p; int n; - old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, - audit_tree_group); - if (!old_entry) + mutex_lock(&audit_tree_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + if (!mark) return create_chunk(inode, tree); - old = container_of(old_entry, struct audit_chunk, mark); - + /* + * Found mark is guaranteed to be attached and mark_mutex protects mark + * from getting detached and thus it makes sure there is chunk attached + * to the mark. + */ /* are we already there? */ spin_lock(&hash_lock); + old = mark_chunk(mark); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - fsnotify_put_mark(old_entry); + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_put_mark(mark); return 0; } } @@ -414,83 +489,38 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - fsnotify_put_mark(old_entry); + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_put_mark(mark); return -ENOMEM; } - chunk_entry = &chunk->mark; - - mutex_lock(&old_entry->group->mark_mutex); - spin_lock(&old_entry->lock); - /* - * mark_mutex protects mark from getting detached and thus also from - * mark->connector->obj getting NULL. - */ - if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - /* old_entry is being shot, lets just lie */ - spin_unlock(&old_entry->lock); - mutex_unlock(&old_entry->group->mark_mutex); - fsnotify_put_mark(old_entry); - fsnotify_put_mark(&chunk->mark); - return -ENOENT; - } - - if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, - FSNOTIFY_OBJ_TYPE_INODE, 1)) { - spin_unlock(&old_entry->lock); - mutex_unlock(&old_entry->group->mark_mutex); - fsnotify_put_mark(chunk_entry); - fsnotify_put_mark(old_entry); - return -ENOSPC; - } - - /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ - spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); - - /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); - chunk->dead = 1; - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); - mutex_unlock(&old_entry->group->mark_mutex); - - fsnotify_destroy_mark(chunk_entry, audit_tree_group); - - fsnotify_put_mark(chunk_entry); - fsnotify_put_mark(old_entry); + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_put_mark(mark); + kfree(chunk); return 0; } - list_replace_init(&old->trees, &chunk->trees); - for (n = 0, p = chunk->owners; n < old->count; n++, p++) { - struct audit_tree *s = old->owners[n].owner; - p->owner = s; - p->index = old->owners[n].index; - if (!s) /* result of fallback in untag */ - continue; - get_tree(s); - list_replace_init(&old->owners[n].list, &p->list); - } + p = &chunk->owners[chunk->count - 1]; p->index = (chunk->count - 1) | (1U<<31); p->owner = tree; get_tree(tree); list_add(&p->list, &tree->chunks); - list_replace_rcu(&old->hash, &chunk->hash); - list_for_each_entry(owner, &chunk->trees, same_root) - owner->root = chunk; - old->dead = 1; if (!tree->root) { tree->root = chunk; list_add(&tree->same_root, &chunk->trees); } + /* + * This has to go last when updating chunk as once replace_chunk() is + * called, new RCU readers can see the new chunk. + */ + replace_chunk(chunk, old); spin_unlock(&hash_lock); - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); - mutex_unlock(&old_entry->group->mark_mutex); - fsnotify_destroy_mark(old_entry, audit_tree_group); - fsnotify_put_mark(chunk_entry); /* drop initial reference */ - fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */ + audit_mark_put_chunk(old); + return 0; } @@ -503,8 +533,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op=remove_rule"); - audit_log_format(ab, " dir="); + audit_log_format(ab, "op=remove_rule dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); @@ -534,22 +563,48 @@ static void kill_rules(struct audit_tree *tree) } /* - * finish killing struct audit_tree + * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged + * chunks. The function expects tagged chunks are all at the beginning of the + * chunks list. */ -static void prune_one(struct audit_tree *victim) +static void prune_tree_chunks(struct audit_tree *victim, bool tagged) { spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; + struct audit_chunk *chunk; + struct fsnotify_mark *mark; + + p = list_first_entry(&victim->chunks, struct node, list); + /* have we run out of marked? */ + if (tagged && !(p->index & (1U<<31))) + break; + chunk = find_chunk(p); + mark = chunk->mark; + remove_chunk_node(chunk, p); + /* Racing with audit_tree_freeing_mark()? */ + if (!mark) + continue; + fsnotify_get_mark(mark); + spin_unlock(&hash_lock); - p = list_entry(victim->chunks.next, struct node, list); + untag_chunk(chunk, mark); + fsnotify_put_mark(mark); - untag_chunk(p); + spin_lock(&hash_lock); } spin_unlock(&hash_lock); put_tree(victim); } +/* + * finish killing struct audit_tree + */ +static void prune_one(struct audit_tree *victim) +{ + prune_tree_chunks(victim, false); +} + /* trim the uncommitted chunks from tree */ static void trim_marked(struct audit_tree *tree) @@ -569,18 +624,11 @@ static void trim_marked(struct audit_tree *tree) list_add(p, &tree->chunks); } } + spin_unlock(&hash_lock); - while (!list_empty(&tree->chunks)) { - struct node *node; - - node = list_entry(tree->chunks.next, struct node, list); - - /* have we run out of marked? */ - if (!(node->index & (1U<<31))) - break; + prune_tree_chunks(tree, true); - untag_chunk(node); - } + spin_lock(&hash_lock); if (!tree->root && !tree->goner) { tree->goner = 1; spin_unlock(&hash_lock); @@ -661,7 +709,7 @@ void audit_trim_trees(void) /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; if (iterate_mounts(compare_root, - (void *)chunk_to_key(chunk), + (void *)(chunk->key), root_mnt)) node->index &= ~(1U<<31); } @@ -959,10 +1007,6 @@ static void evict_chunk(struct audit_chunk *chunk) int need_prune = 0; int n; - if (chunk->dead) - return; - - chunk->dead = 1; mutex_lock(&audit_filter_mutex); spin_lock(&hash_lock); while (!list_empty(&chunk->trees)) { @@ -999,17 +1043,27 @@ static int audit_tree_handle_event(struct fsnotify_group *group, return 0; } -static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) +static void audit_tree_freeing_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) { - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + struct audit_chunk *chunk; - evict_chunk(chunk); + mutex_lock(&mark->group->mark_mutex); + spin_lock(&hash_lock); + chunk = mark_chunk(mark); + replace_mark_chunk(mark, NULL); + spin_unlock(&hash_lock); + mutex_unlock(&mark->group->mark_mutex); + if (chunk) { + evict_chunk(chunk); + audit_mark_put_chunk(chunk); + } /* * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). */ - BUG_ON(refcount_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&mark->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { @@ -1022,6 +1076,8 @@ static int __init audit_tree_init(void) { int i; + audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); if (IS_ERR(audit_tree_group)) audit_panic("cannot initialize fsnotify group for rectree watches"); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 787c7afdf829..20ef9ba134b0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -245,10 +245,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); - audit_log_format(ab, " path="); + audit_log_session_info(ab); + audit_log_format(ab, "op=%s path=", op); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); audit_log_format(ab, " list=%d res=1", r->listnr); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b2d1f043f17f..6593a5207fb0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -200,7 +200,6 @@ static int audit_match_filetype(struct audit_context *ctx, int val) * References in it _are_ dropped - at the same time we free/drop aux stuff. */ -#ifdef CONFIG_AUDIT_TREE static void audit_set_auditable(struct audit_context *ctx) { if (!ctx->prio) { @@ -245,12 +244,10 @@ static int grow_tree_refs(struct audit_context *ctx) ctx->tree_count = 31; return 1; } -#endif static void unroll_tree_refs(struct audit_context *ctx, struct audit_tree_refs *p, int count) { -#ifdef CONFIG_AUDIT_TREE struct audit_tree_refs *q; int n; if (!p) { @@ -274,7 +271,6 @@ static void unroll_tree_refs(struct audit_context *ctx, } ctx->trees = p; ctx->tree_count = count; -#endif } static void free_tree_refs(struct audit_context *ctx) @@ -288,7 +284,6 @@ static void free_tree_refs(struct audit_context *ctx) static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) { -#ifdef CONFIG_AUDIT_TREE struct audit_tree_refs *p; int n; if (!tree) @@ -305,7 +300,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) if (audit_tree_match(p->c[n], tree)) return 1; } -#endif return 0; } @@ -836,44 +830,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ -static inline struct audit_context *audit_take_context(struct task_struct *tsk, - int return_valid, - long return_code) -{ - struct audit_context *context = tsk->audit_context; - - if (!context) - return NULL; - context->return_valid = return_valid; - - /* - * we need to fix up the return code in the audit logs if the actual - * return codes are later going to be fixed up by the arch specific - * signal handlers - * - * This is actually a test for: - * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || - * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) - * - * but is faster than a bunch of || - */ - if (unlikely(return_code <= -ERESTARTSYS) && - (return_code >= -ERESTART_RESTARTBLOCK) && - (return_code != -ENOIOCTLCMD)) - context->return_code = -EINTR; - else - context->return_code = return_code; - - if (context->in_syscall && !context->dummy) { - audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - audit_filter_inodes(tsk, context); - } - - audit_set_context(tsk, NULL); - return context; -} - static inline void audit_proctitle_free(struct audit_context *context) { kfree(context->proctitle.value); @@ -1107,7 +1063,7 @@ static void audit_log_execve_info(struct audit_context *context, } /* write as much as we can to the audit log */ - if (len_buf > 0) { + if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with @@ -1302,15 +1258,18 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } -static void audit_log_proctitle(struct task_struct *tsk, - struct audit_context *context) +static void audit_log_proctitle(void) { int res; char *buf; char *msg = "(null)"; int len = strlen(msg); + struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1323,7 +1282,7 @@ static void audit_log_proctitle(struct task_struct *tsk, if (!buf) goto out; /* Historically called this from procfs naming */ - res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN); if (res == 0) { kfree(buf); goto out; @@ -1343,15 +1302,15 @@ out: audit_log_end(ab); } -static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) +static void audit_log_exit(void) { int i, call_panic = 0; + struct audit_context *context = audit_context(); struct audit_buffer *ab; struct audit_aux_data *aux; struct audit_names *n; - /* tsk == current */ - context->personality = tsk->personality; + context->personality = current->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) @@ -1373,7 +1332,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[3], context->name_count); - audit_log_task_info(ab, tsk); + audit_log_task_info(ab); audit_log_key(ab, context->filterkey); audit_log_end(ab); @@ -1462,7 +1421,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } - audit_log_proctitle(tsk, context); + audit_log_proctitle(); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1480,22 +1439,31 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts */ void __audit_free(struct task_struct *tsk) { - struct audit_context *context; + struct audit_context *context = tsk->audit_context; - context = audit_take_context(tsk, 0, 0); if (!context) return; - /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then free context block. - * We use GFP_ATOMIC here because we might be doing this - * in the context of the idle thread */ - /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); + /* We are called either by do_exit() or the fork() error handling code; + * in the former case tsk == current and in the latter tsk is a + * random task_struct that doesn't doesn't have any meaningful data we + * need to log via audit_log_exit(). + */ + if (tsk == current && !context->dummy && context->in_syscall) { + context->return_valid = 0; + context->return_code = 0; + + audit_filter_syscall(tsk, context, + &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); + if (context->current_state == AUDIT_RECORD_CONTEXT) + audit_log_exit(); + } + if (!list_empty(&context->killed_trees)) audit_kill_trees(&context->killed_trees); + audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1565,17 +1533,40 @@ void __audit_syscall_exit(int success, long return_code) { struct audit_context *context; - if (success) - success = AUDITSC_SUCCESS; - else - success = AUDITSC_FAILURE; - - context = audit_take_context(current, success, return_code); + context = audit_context(); if (!context) return; - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, current); + if (!context->dummy && context->in_syscall) { + if (success) + context->return_valid = AUDITSC_SUCCESS; + else + context->return_valid = AUDITSC_FAILURE; + + /* + * we need to fix up the return code in the audit logs if the + * actual return codes are later going to be fixed up by the + * arch specific signal handlers + * + * This is actually a test for: + * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || + * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) + * + * but is faster than a bunch of || + */ + if (unlikely(return_code <= -ERESTARTSYS) && + (return_code >= -ERESTART_RESTARTBLOCK) && + (return_code != -ENOIOCTLCMD)) + context->return_code = -EINTR; + else + context->return_code = return_code; + + audit_filter_syscall(current, context, + &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(current, context); + if (context->current_state == AUDIT_RECORD_CONTEXT) + audit_log_exit(); + } context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; @@ -1597,12 +1588,10 @@ void __audit_syscall_exit(int success, long return_code) kfree(context->filterkey); context->filterkey = NULL; } - audit_set_context(current, context); } static inline void handle_one(const struct inode *inode) { -#ifdef CONFIG_AUDIT_TREE struct audit_context *context; struct audit_tree_refs *p; struct audit_chunk *chunk; @@ -1627,12 +1616,10 @@ static inline void handle_one(const struct inode *inode) return; } put_tree_ref(context, chunk); -#endif } static void handle_path(const struct dentry *dentry) { -#ifdef CONFIG_AUDIT_TREE struct audit_context *context; struct audit_tree_refs *p; const struct dentry *d, *parent; @@ -1685,7 +1672,6 @@ retry: return; } rcu_read_unlock(); -#endif } static struct audit_names *audit_alloc_name(struct audit_context *context, @@ -2035,7 +2021,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(current); + tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); @@ -2056,7 +2042,6 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, */ int audit_set_loginuid(kuid_t loginuid) { - struct task_struct *task = current; unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; @@ -2075,8 +2060,8 @@ int audit_set_loginuid(kuid_t loginuid) sessionid = (unsigned int)atomic_inc_return(&session_id); } - task->sessionid = sessionid; - task->loginuid = loginuid; + current->sessionid = sessionid; + current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; @@ -2513,10 +2498,9 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names, if (unlikely(!ab)) return; - audit_log_format(ab, "op=seccomp-logging"); - audit_log_format(ab, " actions=%s", names); - audit_log_format(ab, " old-actions=%s", old_names); - audit_log_format(ab, " res=%d", res); + audit_log_format(ab, + "op=seccomp-logging actions=%s old-actions=%s res=%d", + names, old_names, res); audit_log_end(ab); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..7a8429f8e280 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5343,7 +5343,7 @@ int __init cgroup_init(void) cgroup_rstat_boot(); /* - * The latency of the synchronize_sched() is too high for cgroups, + * The latency of the synchronize_rcu() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); diff --git a/kernel/events/core.c b/kernel/events/core.c index 84530ab358c3..67ecac337374 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5541,7 +5541,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, /* non mergable */ + .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -9918,7 +9918,7 @@ static void account_event(struct perf_event *event) * call the perf scheduling hooks before proceeding to * install events that need them. */ - synchronize_sched(); + synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index d6b56180827c..5befb338a18d 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -238,7 +238,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Contraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter: * * == Non-pinned counter == (Considered as pinned for now) * diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff89c7b..e2a5156bc9c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -240,8 +240,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) + if (stack) { tsk->stack_vm_area = find_vm_area(stack); + tsk->stack = stack; + } return stack; #else struct page *page = alloc_pages_node(node, THREADINFO_GFP, @@ -288,7 +290,10 @@ static struct kmem_cache *thread_stack_cache; static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + unsigned long *stack; + stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + tsk->stack = stack; + return stack; } static void free_thread_stack(struct task_struct *tsk) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f4f29b9d90ee..45b68b4ea48b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,15 +94,15 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, - struct cpumask *masks) +static int __irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, int firstvec, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct irq_affinity_desc *masks) { int n, nodes, cpus_per_vec, extra_vecs, done = 0; - int last_affv = affd->pre_vectors + numvecs; + int last_affv = firstvec + numvecs; int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; @@ -117,12 +117,13 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, node_to_cpumask[n]); - if (++done == numvecs) - break; + cpumask_or(&masks[curvec].mask, + &masks[curvec].mask, + node_to_cpumask[n]); if (++curvec == last_affv) - curvec = affd->pre_vectors; + curvec = firstvec; } + done = numvecs; goto out; } @@ -130,7 +131,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -151,14 +152,15 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, cpus_per_vec++; --extra_vecs; } - irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + irq_spread_init_one(&masks[curvec].mask, nmsk, + cpus_per_vec); } done += v; if (done >= numvecs) break; if (curvec >= last_affv) - curvec = affd->pre_vectors; + curvec = firstvec; --nodes; } @@ -166,20 +168,77 @@ out: return done; } +/* + * build affinity in two stages: + * 1) spread present CPU on these vectors + * 2) spread other possible CPUs on these vectors + */ +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, int firstvec, + cpumask_var_t *node_to_cpumask, + struct irq_affinity_desc *masks) +{ + int curvec = startvec, nr_present, nr_others; + int ret = -ENOMEM; + cpumask_var_t nmsk, npresmsk; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return ret; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail; + + ret = 0; + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + + /* Spread on present CPUs starting from affd->pre_vectors */ + nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, + firstvec, node_to_cpumask, + cpu_present_mask, nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (nr_present >= numvecs) + curvec = firstvec; + else + curvec = firstvec + nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + firstvec, node_to_cpumask, + npresmsk, nmsk, masks); + put_online_cpus(); + + if (nr_present < numvecs) + WARN_ON(nr_present + nr_others < numvecs); + + free_cpumask_var(npresmsk); + + fail: + free_cpumask_var(nmsk); + return ret; +} + /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading * @nvecs: The total number of vectors * @affd: Description of the affinity requirements * - * Returns the masks pointer or NULL if allocation failed. + * Returns the irq_affinity_desc pointer or NULL if allocation failed. */ -struct cpumask * +struct irq_affinity_desc * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; int curvec, usedvecs; - cpumask_var_t nmsk, npresmsk, *node_to_cpumask; - struct cpumask *masks = NULL; + cpumask_var_t *node_to_cpumask; + struct irq_affinity_desc *masks = NULL; + int i, nr_sets; /* * If there aren't any vectors left after applying the pre/post @@ -188,15 +247,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (nvecs == affd->pre_vectors + affd->post_vectors) return NULL; - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto outcpumsk; - node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outnpresmsk; + return NULL; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -204,32 +257,29 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - - /* Spread on present CPUs starting from affd->pre_vectors */ - usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_present_mask, - nmsk, masks); - + cpumask_copy(&masks[curvec].mask, irq_default_affinity); /* - * Spread on non present CPUs starting from the next vector to be - * handled. If the spreading of present CPUs already exhausted the - * vector space, assign the non present CPUs to the already spread - * out vectors. + * Spread on present CPUs starting from affd->pre_vectors. If we + * have multiple sets, build each sets affinity mask separately. */ - if (usedvecs >= affvecs) - curvec = affd->pre_vectors; - else - curvec = affd->pre_vectors + usedvecs; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, npresmsk, - nmsk, masks); - put_online_cpus(); + nr_sets = affd->nr_sets; + if (!nr_sets) + nr_sets = 1; + + for (i = 0, usedvecs = 0; i < nr_sets; i++) { + int this_vecs = affd->sets ? affd->sets[i] : affvecs; + int ret; + + ret = irq_build_affinity_masks(affd, curvec, this_vecs, + curvec, node_to_cpumask, masks); + if (ret) { + kfree(masks); + masks = NULL; + goto outnodemsk; + } + curvec += this_vecs; + usedvecs += this_vecs; + } /* Fill out vectors at the end that don't need affinity */ if (usedvecs >= affvecs) @@ -237,14 +287,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) else curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); + cpumask_copy(&masks[curvec].mask, irq_default_affinity); + + /* Mark the managed interrupts */ + for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) + masks[i].is_managed = 1; outnodemsk: free_node_to_cpumask(node_to_cpumask); -outnpresmsk: - free_cpumask_var(npresmsk); -outcpumsk: - free_cpumask_var(nmsk); return masks; } @@ -258,13 +308,21 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; - int ret; + int set_vecs; if (resv > minvec) return 0; - get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; - put_online_cpus(); - return ret; + if (affd->nr_sets) { + int i; + + for (i = 0, set_vecs = 0; i < affd->nr_sets; i++) + set_vecs += affd->sets[i]; + } else { + get_online_cpus(); + set_vecs = cpumask_weight(cpu_possible_mask); + put_online_cpus(); + } + + return resv + min(set_vecs, vecs); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b3d9de999c..34e969069488 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -929,7 +929,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, break; /* * Bail out if the outer chip is not set up - * and the interrrupt supposed to be started + * and the interrupt supposed to be started * right away. */ if (WARN_ON(is_chained)) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 6a682c229e10..5d5378ea0afe 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res) * @cnt: Number of consecutive irqs to allocate * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask array of size @cnt + * @affinity: Optional pointer to an irq_affinity_desc array of size @cnt * which hints where the irq descriptors should be allocated * and which default affinities to use * @@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res) */ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, - const struct cpumask *affinity) + const struct irq_affinity_desc *affinity) { struct irq_desc_devres *dr; int base; diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 8b778e37dc6d..43e3d1be622c 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -56,7 +56,7 @@ int irq_reserve_ipi(struct irq_domain *domain, unsigned int next; /* - * The IPI requires a seperate HW irq on each CPU. We require + * The IPI requires a separate HW irq on each CPU. We require * that the destination mask is consecutive. If an * implementation needs to support holes, it can reserve * several IPI ranges. @@ -172,7 +172,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) /* * Get the real hardware irq number if the underlying implementation - * uses a seperate irq per cpu. If the underlying implementation uses + * uses a separate irq per cpu. If the underlying implementation uses * a single hardware irq for all cpus then the IPI send mechanism * needs to take care of the cpu destinations. */ diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index dd20d0d528d4..98a20e1594ce 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -34,9 +34,20 @@ static struct irq_chip irq_sim_irqchip = { static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; + unsigned int offset = 0; + struct irq_sim *sim; + int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - handle_simple_irq(irq_to_desc(work_ctx->irq)); + sim = container_of(work_ctx, struct irq_sim, work_ctx); + + while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + offset = find_next_bit(work_ctx->pending, + sim->irq_count, offset); + clear_bit(offset, work_ctx->pending); + irqnum = irq_sim_irqnum(sim, offset); + handle_simple_irq(irq_to_desc(irqnum)); + } } /** @@ -63,6 +74,13 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) return sim->irq_base; } + sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!sim->work_ctx.pending) { + kfree(sim->irqs); + irq_free_descs(sim->irq_base, num_irqs); + return -ENOMEM; + } + for (i = 0; i < num_irqs; i++) { sim->irqs[i].irqnum = sim->irq_base + i; sim->irqs[i].enabled = false; @@ -89,6 +107,7 @@ EXPORT_SYMBOL_GPL(irq_sim_init); void irq_sim_fini(struct irq_sim *sim) { irq_work_sync(&sim->work_ctx.work); + bitmap_free(sim->work_ctx.pending); irq_free_descs(sim->irq_base, sim->irq_count); kfree(sim->irqs); } @@ -143,7 +162,7 @@ EXPORT_SYMBOL_GPL(devm_irq_sim_init); void irq_sim_fire(struct irq_sim *sim, unsigned int offset) { if (sim->irqs[offset].enabled) { - sim->work_ctx.irq = irq_sim_irqnum(sim, offset); + set_bit(offset, sim->work_ctx.pending); irq_work_queue(&sim->work_ctx.work); } } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 578d0e5f1b5b..ee062b7939d3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -449,30 +449,34 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - const struct cpumask *affinity, struct module *owner) + const struct irq_affinity_desc *affinity, + struct module *owner) { - const struct cpumask *mask = NULL; struct irq_desc *desc; - unsigned int flags; int i; /* Validate affinity mask(s) */ if (affinity) { - for (i = 0, mask = affinity; i < cnt; i++, mask++) { - if (cpumask_empty(mask)) + for (i = 0; i < cnt; i++, i++) { + if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } - flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; - mask = NULL; - for (i = 0; i < cnt; i++) { + const struct cpumask *mask = NULL; + unsigned int flags = 0; + if (affinity) { - node = cpu_to_node(cpumask_first(affinity)); - mask = affinity; + if (affinity->is_managed) { + flags = IRQD_AFFINITY_MANAGED | + IRQD_MANAGED_SHUTDOWN; + } + mask = &affinity->mask; + node = cpu_to_node(cpumask_first(mask)); affinity++; } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; @@ -575,7 +579,7 @@ static void free_desc(unsigned int irq) } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, - const struct cpumask *affinity, + const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; @@ -705,7 +709,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs); */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct cpumask *affinity) + struct module *owner, const struct irq_affinity_desc *affinity) { int start, ret; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3366d11c3e02..8b0be4bd6565 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -969,7 +969,7 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node, const struct cpumask *affinity) + int node, const struct irq_affinity_desc *affinity) { unsigned int hint; @@ -1281,7 +1281,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc, const struct cpumask *affinity) + bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9dbdccab3b6a..a4888ce4667a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -915,7 +915,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif /* - * Interrupts which are not explicitely requested as threaded + * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 1f0985adf193..30cc217b8631 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -14,6 +14,7 @@ struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; + unsigned int managed_allocated; bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; @@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m, return best_cpu; } +/* Find the best CPU which has the lowest number of managed IRQs allocated */ +static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m, + const struct cpumask *msk) +{ + unsigned int cpu, best_cpu, allocated = UINT_MAX; + struct cpumap *cm; + + best_cpu = UINT_MAX; + + for_each_cpu(cpu, msk) { + cm = per_cpu_ptr(m->maps, cpu); + + if (!cm->online || cm->managed_allocated > allocated) + continue; + + best_cpu = cpu; + allocated = cm->managed_allocated; + } + return best_cpu; +} + /** * irq_matrix_assign_system - Assign system wide entry in the matrix * @m: Matrix pointer @@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, if (cpumask_empty(msk)) return -EINVAL; - cpu = matrix_find_best_cpu(m, msk); + cpu = matrix_find_best_cpu_managed(m, msk); if (cpu == UINT_MAX) return -ENOSPC; @@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, return -ENOSPC; set_bit(bit, cm->alloc_map); cm->allocated++; + cm->managed_allocated++; m->total_allocated++; *mapped_cpu = cpu; trace_irq_matrix_alloc_managed(bit, cpu, m, cm); @@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, clear_bit(bit, cm->alloc_map); cm->allocated--; + if(managed) + cm->managed_allocated--; if (cm->online) m->total_allocated--; @@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, m->system_map); - seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); + seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " "); cpus_read_lock(); for_each_online_cpu(cpu) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", - cpu, cm->available, cm->managed, cm->allocated, + seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ", + cpu, cm->available, cm->managed, + cm->managed_allocated, cm->allocated, m->matrix_bits, cm->alloc_map); } cpus_read_unlock(); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4ca2fd46645d..ad26fbcfbfc8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -23,11 +23,11 @@ * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec * - * If @affinity is not NULL then a an affinity array[@nvec] is allocated - * and the affinity masks from @affinity are copied. + * If @affinity is not NULL then an affinity array[@nvec] is allocated + * and the affinity masks and flags from @affinity are copied. */ -struct msi_desc * -alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) +struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, + const struct irq_affinity_desc *affinity) { struct msi_desc *desc; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index d867d6ddafdd..6d2fa6914b30 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -66,7 +66,7 @@ static int try_one_irq(struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* - * PER_CPU, nested thread interrupts and interrupts explicitely + * PER_CPU, nested thread interrupts and interrupts explicitly * marked polled are excluded from polling. */ if (irq_settings_is_per_cpu(desc) || @@ -76,7 +76,7 @@ static int try_one_irq(struct irq_desc *desc, bool force) /* * Do not poll disabled interrupts unless the spurious - * disabled poller asks explicitely. + * disabled poller asks explicitly. */ if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; @@ -292,7 +292,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) * So in case a thread is woken, we just note the fact and * defer the analysis to the next hardware interrupt. * - * The threaded handlers store whether they sucessfully + * The threaded handlers store whether they successfully * handled an interrupt and we check whether that number * changed versus the last invocation. * diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 35cf0ad29718..f1d0e00a3971 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -16,6 +16,7 @@ #include <linux/file.h> #include <linux/slab.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/fs.h> @@ -76,7 +77,7 @@ void * __weak arch_kexec_kernel_image_load(struct kimage *image) return kexec_image_load_default(image); } -static int kexec_image_post_load_cleanup_default(struct kimage *image) +int kexec_image_post_load_cleanup_default(struct kimage *image) { if (!image->fops || !image->fops->cleanup) return 0; @@ -499,8 +500,60 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + u64 i; + phys_addr_t mstart, mend; + struct resource res = { }; + + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return func(&crashk_res, kbuf); + + if (kbuf->top_down) { + for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + /* + * In memblock, end points to the first byte after the + * range while in kexec, end points to the last byte + * in the range. + */ + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); + if (ret) + break; + } + } else { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + /* + * In memblock, end points to the first byte after the + * range while in kexec, end points to the last byte + * in the range. + */ + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); + if (ret) + break; + } + } + + return ret; +} +#endif + /** - * arch_kexec_walk_mem - call func(data) on free memory regions + * kexec_walk_resources - call func(data) on free memory regions * @kbuf: Context info for the search. Also passed to @func. * @func: Function to call for each memory region. * @@ -508,8 +561,8 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned. */ -int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) +static int kexec_walk_resources(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, @@ -532,7 +585,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) { int ret; - ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + /* Arch knows where to place */ + if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) + return 0; + + if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); + else + ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); return ret == 1 ? 0 : -EADDRNOTAVAIL; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647..f4ddfdd2d07e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -229,7 +229,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ - synchronize_sched(); + synchronize_rcu(); list_for_each_entry_safe(kip, next, &c->pages, list) { int i; @@ -1382,7 +1382,7 @@ out: if (ret) { ap->flags |= KPROBE_FLAG_DISABLED; list_del_rcu(&p->list); - synchronize_sched(); + synchronize_rcu(); } } } @@ -1597,7 +1597,7 @@ int register_kprobe(struct kprobe *p) ret = arm_kprobe(p); if (ret) { hlist_del_rcu(&p->hlist); - synchronize_sched(); + synchronize_rcu(); goto out; } } @@ -1776,7 +1776,7 @@ void unregister_kprobes(struct kprobe **kps, int num) kps[i]->addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); @@ -1966,7 +1966,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) rps[i]->kp.addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); @@ -2093,6 +2093,47 @@ void dump_kprobe(struct kprobe *kp) } NOKPROBE_SYMBOL(dump_kprobe); +int kprobe_add_ksym_blacklist(unsigned long entry) +{ + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) + return -EINVAL; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = entry; + ent->end_addr = entry + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + + return (int)size; +} + +/* Add all symbols in given area into kprobe blacklist */ +int kprobe_add_area_blacklist(unsigned long start, unsigned long end) +{ + unsigned long entry; + int ret = 0; + + for (entry = start; entry < end; entry += ret) { + ret = kprobe_add_ksym_blacklist(entry); + if (ret < 0) + return ret; + if (ret == 0) /* In case of alias symbol */ + ret = 1; + } + return 0; +} + +int __init __weak arch_populate_kprobe_blacklist(void) +{ + return 0; +} + /* * Lookup and populate the kprobe_blacklist. * @@ -2104,26 +2145,24 @@ NOKPROBE_SYMBOL(dump_kprobe); static int __init populate_kprobe_blacklist(unsigned long *start, unsigned long *end) { + unsigned long entry; unsigned long *iter; - struct kprobe_blacklist_entry *ent; - unsigned long entry, offset = 0, size = 0; + int ret; for (iter = start; iter < end; iter++) { entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) + ret = kprobe_add_ksym_blacklist(entry); + if (ret == -EINVAL) continue; - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - return -ENOMEM; - ent->start_addr = entry; - ent->end_addr = entry + size; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_blacklist); + if (ret < 0) + return ret; } - return 0; + + /* Symbols in __kprobes_text are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, + (unsigned long)__kprobes_text_end); + + return ret ? : arch_populate_kprobe_blacklist(); } /* Module notifier call back, checking kprobes on the module */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 82d584225dc6..7702cb4064fc 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -61,7 +61,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); @@ -72,7 +72,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, /* * func should never be NULL because preemption should be disabled here * and unregister_ftrace_function() does the equivalent of a - * synchronize_sched() before the func_stack removal. + * synchronize_rcu() before the func_stack removal. */ if (WARN_ON_ONCE(!func)) goto unlock; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 5bc349805e03..304d5eb8a98c 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -52,7 +52,7 @@ static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. */ static void klp_sync(struct work_struct *work) @@ -175,7 +175,7 @@ void klp_cancel_transition(void) void klp_update_patch_state(struct task_struct *task) { /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1efada2dd9dd..95932333a48b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,6 +138,9 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. */ unsigned long nr_lock_classes; +#ifndef CONFIG_DEBUG_LOCKDEP +static +#endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) @@ -626,7 +629,8 @@ static int static_obj(void *obj) /* * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: + * class->name_version generation counter. The caller must hold the graph + * lock. */ static int count_matching_names(struct lock_class *new_class) { @@ -636,7 +640,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -789,7 +793,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); INIT_LIST_HEAD(&class->locks_before); INIT_LIST_HEAD(&class->locks_after); class->name_version = count_matching_names(class); @@ -801,7 +804,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) /* * Add it to the global list of classes: */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + list_add_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -3088,7 +3091,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -static void __lockdep_init_map(struct lockdep_map *lock, const char *name, +void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3144,12 +3147,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } - -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - __lockdep_init_map(lock, name, key, subclass); -} EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; @@ -4126,6 +4123,9 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* + * Remove all references to a lock class. The caller must hold the graph lock. + */ static void zap_class(struct lock_class *class) { int i; @@ -4142,7 +4142,7 @@ static void zap_class(struct lock_class *class) * Unhash the class and remove it from the all_lock_classes list: */ hlist_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); + list_del(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); RCU_INIT_POINTER(class->name, NULL); @@ -4195,7 +4195,7 @@ void lockdep_free_key_range(void *start, unsigned long size) * * sync_sched() is sufficient because the read-side is IRQ disable. */ - synchronize_sched(); + synchronize_rcu(); /* * XXX at this point we could return the resources to the pool; @@ -4204,15 +4204,36 @@ void lockdep_free_key_range(void *start, unsigned long size) */ } -void lockdep_reset_lock(struct lockdep_map *lock) +/* + * Check whether any element of the @lock->class_cache[] array refers to a + * registered lock class. The caller must hold either the graph lock or the + * RCU read lock. + */ +static bool lock_class_cache_is_registered(struct lockdep_map *lock) { struct lock_class *class; struct hlist_head *head; - unsigned long flags; int i, j; - int locked; + + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + hlist_for_each_entry_rcu(class, head, hash_entry) { + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + if (lock->class_cache[j] == class) + return true; + } + } + return false; +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + struct lock_class *class; + unsigned long flags; + int j, locked; raw_local_irq_save(flags); + locked = graph_lock(); /* * Remove all classes this lock might have: @@ -4229,25 +4250,14 @@ void lockdep_reset_lock(struct lockdep_map *lock) * Debug check: in the end all mapped classes should * be gone. */ - locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - hlist_for_each_entry_rcu(class, head, hash_entry) { - int match = 0; - - for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) - match |= class == lock->class_cache[j]; - - if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); - } - goto out_restore; - } + if (unlikely(lock_class_cache_is_registered(lock))) { + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? + */ + WARN_ON(1); } + goto out_restore; } if (locked) graph_unlock(); diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9aa713629387..771d4ca96dda 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -36,7 +36,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -51,7 +51,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); /* Mark the current thread as blocked on the lock: */ task->blocked_on = waiter; diff --git a/kernel/module.c b/kernel/module.c index 06ec68f08387..d46c7814a00e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -495,9 +495,9 @@ struct find_symbol_arg { const struct kernel_symbol *sym; }; -static bool check_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) +static bool check_exported_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) { struct find_symbol_arg *fsa = data; @@ -555,9 +555,9 @@ static int cmp_name(const void *va, const void *vb) return strcmp(a, kernel_symbol_name(b)); } -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - void *data) +static bool find_exported_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) { struct find_symbol_arg *fsa = data; struct kernel_symbol *sym; @@ -565,13 +565,14 @@ static bool find_symbol_in_section(const struct symsearch *syms, sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); - if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) + if (sym != NULL && check_exported_symbol(syms, owner, + sym - syms->start, data)) return true; return false; } -/* Find a symbol and return it, along with, (optional) crc and +/* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, @@ -585,7 +586,7 @@ const struct kernel_symbol *find_symbol(const char *name, fsa.gplok = gplok; fsa.warn = warn; - if (each_symbol_section(find_symbol_in_section, &fsa)) { + if (each_symbol_section(find_exported_symbol_in_section, &fsa)) { if (owner) *owner = fsa.owner; if (crc) @@ -2159,7 +2160,7 @@ static void free_module(struct module *mod) /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ @@ -2198,7 +2199,7 @@ EXPORT_SYMBOL_GPL(__symbol_get); * * You must hold the module_mutex. */ -static int verify_export_symbols(struct module *mod) +static int verify_exported_symbols(struct module *mod) { unsigned int i; struct module *owner; @@ -2519,10 +2520,10 @@ static void free_modinfo(struct module *mod) #ifdef CONFIG_KALLSYMS -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) { return bsearch(name, start, stop - start, sizeof(struct kernel_symbol), cmp_name); @@ -2533,9 +2534,10 @@ static int is_exported(const char *name, unsigned long value, { const struct kernel_symbol *ks; if (!mod) - ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); else - ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && kernel_symbol_value(ks) == value; } @@ -2682,7 +2684,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) /* Set types up while we still have access to sections. */ for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_info + mod->kallsyms->symtab[i].st_size = elf_type(&mod->kallsyms->symtab[i], info); /* Now populate the cut down core kallsyms for after init. */ @@ -3512,15 +3514,15 @@ static noinline int do_init_module(struct module *mod) /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we - * call synchronize_sched(), but we don't want to slow down the success + * call synchronize_rcu(), but we don't want to slow down the success * path, so use actual RCU here. * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie - * rcu_barrier_sched() + * rcu_barrier() */ - call_rcu_sched(&freeinit->rcu, do_free_init); + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3531,7 +3533,7 @@ fail_free_freeinit: fail: /* Try to protect us from buggy refcounters. */ mod->state = MODULE_STATE_GOING; - synchronize_sched(); + synchronize_rcu(); module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); @@ -3597,7 +3599,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mutex_lock(&module_mutex); /* Find duplicate symbols (must be called under lock). */ - err = verify_export_symbols(mod); + err = verify_exported_symbols(mod); if (err < 0) goto out; @@ -3824,7 +3826,7 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); - synchronize_sched(); + synchronize_rcu(); kfree(mod->args); free_arch_cleanup: module_arch_cleanup(mod); @@ -3839,7 +3841,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ @@ -3916,18 +3918,22 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } -static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) { return kallsyms->strtab + kallsyms->symtab[symnum].st_name; } -static const char *get_ksymbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) { unsigned int i, best = 0; - unsigned long nextval; + unsigned long nextval, bestval; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ @@ -3936,34 +3942,40 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < kallsyms->num_symtab; i++) { - if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (*symname(kallsyms, i) == '\0' - || is_arm_mapping_symbol(symname(kallsyms, i))) + if (*kallsyms_symbol_name(kallsyms, i) == '\0' + || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; - if (kallsyms->symtab[i].st_value <= addr - && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) + if (thisval <= addr && thisval > bestval) { best = i; - if (kallsyms->symtab[i].st_value > addr - && kallsyms->symtab[i].st_value < nextval) - nextval = kallsyms->symtab[i].st_value; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; } if (!best) return NULL; if (size) - *size = nextval - kallsyms->symtab[best].st_value; + *size = nextval - bestval; if (offset) - *offset = addr - kallsyms->symtab[best].st_value; - return symname(kallsyms, best); + *offset = addr - bestval; + + return kallsyms_symbol_name(kallsyms, best); } void * __weak dereference_module_function_descriptor(struct module *mod, @@ -3988,7 +4000,8 @@ const char *module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); + + ret = find_kallsyms_symbol(mod, addr, size, offset); } /* Make a copy in here where it's safe */ if (ret) { @@ -4011,9 +4024,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) if (within_module(addr, mod)) { const char *sym; - sym = get_ksymbol(mod, addr, NULL, NULL); + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); if (!sym) goto out; + strlcpy(symname, sym, KSYM_NAME_LEN); preempt_enable(); return 0; @@ -4036,7 +4050,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, if (within_module(addr, mod)) { const char *sym; - sym = get_ksymbol(mod, addr, size, offset); + sym = find_kallsyms_symbol(mod, addr, size, offset); if (!sym) goto out; if (modname) @@ -4065,9 +4079,11 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, continue; kallsyms = rcu_dereference_sched(mod->kallsyms); if (symnum < kallsyms->num_symtab) { - *value = kallsyms->symtab[symnum].st_value; - *type = kallsyms->symtab[symnum].st_info; - strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = sym->st_size; + strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); @@ -4079,15 +4095,19 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return -ERANGE; } -static unsigned long mod_find_symname(struct module *mod, const char *name) +/* Given a module and name of symbol, find and return the symbol's value */ +static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < kallsyms->num_symtab; i++) - if (strcmp(name, symname(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_shndx != SHN_UNDEF) - return kallsyms->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } return 0; } @@ -4102,12 +4122,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) preempt_disable(); if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = mod_find_symname(mod, colon+1); + ret = find_kallsyms_symbol_value(mod, colon+1); } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((ret = mod_find_symname(mod, name)) != 0) + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) break; } } @@ -4132,12 +4152,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; - if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + if (sym->st_shndx == SHN_UNDEF) continue; - ret = fn(data, symname(kallsyms, i), - mod, kallsyms->symtab[i].st_value); + ret = fn(data, kallsyms_symbol_name(kallsyms, i), + mod, kallsyms_symbol_value(sym)); if (ret != 0) return ret; } diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2075ce8e4b3..6b9a926fd86b 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -83,6 +83,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) } return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - NULL, VERIFYING_MODULE_SIGNATURE, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, NULL, NULL); } diff --git a/kernel/panic.c b/kernel/panic.c index f6d549a29a5c..d10c340c43b0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -14,6 +14,7 @@ #include <linux/kmsg_dump.h> #include <linux/kallsyms.h> #include <linux/notifier.h> +#include <linux/vt_kern.h> #include <linux/module.h> #include <linux/random.h> #include <linux/ftrace.h> @@ -237,7 +238,10 @@ void panic(const char *fmt, ...) if (_crash_kexec_post_notifiers) __crash_kexec(NULL); - bust_spinlocks(0); +#ifdef CONFIG_VT + unblank_screen(); +#endif + console_unblank(); /* * We may have ended up stopping the CPU holding the lock (in diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3a6c2f87699e..f8fe57d1022e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool + +config ENERGY_MODEL + bool "Energy Model for CPUs" + depends on SMP + depends on CPU_FREQ + default n + help + Several subsystems (thermal and/or the task scheduler for example) + can leverage information about the energy consumed by CPUs to make + smarter decisions. This config option enables the framework from + which subsystems can access the energy models. + + The exact usage of the energy model is subsystem-dependent. + + If in doubt, say N. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index a3f79f0eef36..e7e47d9be1e5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 000000000000..d9dc2c38764a --- /dev/null +++ b/kernel/power/energy_model.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Energy Model of CPUs + * + * Copyright (c) 2018, Arm ltd. + * Written by: Quentin Perret, Arm ltd. + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/energy_model.h> +#include <linux/sched/topology.h> +#include <linux/slab.h> + +/* Mapping of each CPU to the performance domain to which it belongs. */ +static DEFINE_PER_CPU(struct em_perf_domain *, em_data); + +/* + * Mutex serializing the registrations of performance domains and letting + * callbacks defined by drivers sleep. + */ +static DEFINE_MUTEX(em_pd_mutex); + +static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, + struct em_data_callback *cb) +{ + unsigned long opp_eff, prev_opp_eff = ULONG_MAX; + unsigned long power, freq, prev_freq = 0; + int i, ret, cpu = cpumask_first(span); + struct em_cap_state *table; + struct em_perf_domain *pd; + u64 fmax; + + if (!cb->active_power) + return NULL; + + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return NULL; + + table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); + if (!table) + goto free_pd; + + /* Build the list of capacity states for this performance domain */ + for (i = 0, freq = 0; i < nr_states; i++, freq++) { + /* + * active_power() is a driver callback which ceils 'freq' to + * lowest capacity state of 'cpu' above 'freq' and updates + * 'power' and 'freq' accordingly. + */ + ret = cb->active_power(&power, &freq, cpu); + if (ret) { + pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); + goto free_cs_table; + } + + /* + * We expect the driver callback to increase the frequency for + * higher capacity states. + */ + if (freq <= prev_freq) { + pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); + goto free_cs_table; + } + + /* + * The power returned by active_state() is expected to be + * positive, in milli-watts and to fit into 16 bits. + */ + if (!power || power > EM_CPU_MAX_POWER) { + pr_err("pd%d: invalid power: %lu\n", cpu, power); + goto free_cs_table; + } + + table[i].power = power; + table[i].frequency = prev_freq = freq; + + /* + * The hertz/watts efficiency ratio should decrease as the + * frequency grows on sane platforms. But this isn't always + * true in practice so warn the user if a higher OPP is more + * power efficient than a lower one. + */ + opp_eff = freq / power; + if (opp_eff >= prev_opp_eff) + pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", + cpu, i, i - 1); + prev_opp_eff = opp_eff; + } + + /* Compute the cost of each capacity_state. */ + fmax = (u64) table[nr_states - 1].frequency; + for (i = 0; i < nr_states; i++) { + table[i].cost = div64_u64(fmax * table[i].power, + table[i].frequency); + } + + pd->table = table; + pd->nr_cap_states = nr_states; + cpumask_copy(to_cpumask(pd->cpus), span); + + return pd; + +free_cs_table: + kfree(table); +free_pd: + kfree(pd); + + return NULL; +} + +/** + * em_cpu_get() - Return the performance domain for a CPU + * @cpu : CPU to find the performance domain for + * + * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_cpu_get(int cpu) +{ + return READ_ONCE(per_cpu(em_data, cpu)); +} +EXPORT_SYMBOL_GPL(em_cpu_get); + +/** + * em_register_perf_domain() - Register the Energy Model of a performance domain + * @span : Mask of CPUs in the performance domain + * @nr_states : Number of capacity states to register + * @cb : Callback functions providing the data of the Energy Model + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb) +{ + unsigned long cap, prev_cap = 0; + struct em_perf_domain *pd; + int cpu, ret = 0; + + if (!span || !nr_states || !cb) + return -EINVAL; + + /* + * Use a mutex to serialize the registration of performance domains and + * let the driver-defined callback functions sleep. + */ + mutex_lock(&em_pd_mutex); + + for_each_cpu(cpu, span) { + /* Make sure we don't register again an existing domain. */ + if (READ_ONCE(per_cpu(em_data, cpu))) { + ret = -EEXIST; + goto unlock; + } + + /* + * All CPUs of a domain must have the same micro-architecture + * since they all share the same table. + */ + cap = arch_scale_cpu_capacity(NULL, cpu); + if (prev_cap && prev_cap != cap) { + pr_err("CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(span)); + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } + + /* Create the performance domain and add it to the Energy Model. */ + pd = em_create_pd(span, nr_states, cb); + if (!pd) { + ret = -EINVAL; + goto unlock; + } + + for_each_cpu(cpu, span) { + /* + * The per-cpu array can be read concurrently from em_cpu_get(). + * The barrier enforces the ordering needed to make sure readers + * can only access well formed em_perf_domain structs. + */ + smp_store_release(per_cpu_ptr(&em_data, cpu), pd); + } + + pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); +unlock: + mutex_unlock(&em_pd_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(em_register_perf_domain); diff --git a/kernel/power/main.c b/kernel/power/main.c index 35b50823d83b..98e76cad128b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -318,23 +318,12 @@ static int suspend_stats_show(struct seq_file *s, void *unused) return 0; } - -static int suspend_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, suspend_stats_show, NULL); -} - -static const struct file_operations suspend_stats_operations = { - .open = suspend_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, - NULL, NULL, &suspend_stats_operations); + NULL, NULL, &suspend_stats_fops); return 0; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 86d72ffb811b..b7a82502857a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -184,7 +184,7 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } -static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) +static int pm_qos_debug_show(struct seq_file *s, void *unused) { struct pm_qos_object *qos = (struct pm_qos_object *)s->private; struct pm_qos_constraints *c; @@ -245,18 +245,7 @@ out: return 0; } -static int pm_qos_dbg_open(struct inode *inode, struct file *file) -{ - return single_open(file, pm_qos_dbg_show_requests, - inode->i_private); -} - -static const struct file_operations pm_qos_debug_fops = { - .open = pm_qos_dbg_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(pm_qos_debug); /** * pm_qos_update_target - manages the constraints list and calls the notifiers diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1b2a029360b7..1306fe0c1dc6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -403,6 +403,7 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); static u64 syslog_seq; static u32 syslog_idx; static size_t syslog_partial; +static bool syslog_time; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -752,6 +753,19 @@ struct devkmsg_user { char buf[CONSOLE_EXT_LOG_MAX]; }; +static __printf(3, 4) __cold +int devkmsg_emit(int facility, int level, const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_emit(facility, level, NULL, 0, fmt, args); + va_end(args); + + return r; +} + static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; @@ -810,7 +824,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) } } - printk_emit(facility, level, NULL, 0, "%s", line); + devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } @@ -1213,50 +1227,39 @@ static inline void boot_delay_msec(int level) static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); -static size_t print_time(u64 ts, char *buf) +static size_t print_syslog(unsigned int level, char *buf) { - unsigned long rem_nsec; - - if (!printk_time) - return 0; - - rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "<%u>", level); +} - if (!buf) - return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); } -static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) +static size_t print_prefix(const struct printk_log *msg, bool syslog, + bool time, char *buf) { size_t len = 0; - unsigned int prefix = (msg->facility << 3) | msg->level; - - if (syslog) { - if (buf) { - len += sprintf(buf, "<%u>", prefix); - } else { - len += 3; - if (prefix > 999) - len += 3; - else if (prefix > 99) - len += 2; - else if (prefix > 9) - len++; - } - } - len += print_time(msg->ts_nsec, buf ? buf + len : NULL); + if (syslog) + len = print_syslog((msg->facility << 3) | msg->level, buf); + if (time) + len += print_time(msg->ts_nsec, buf + len); return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; size_t len = 0; + char prefix[PREFIX_MAX]; + const size_t prefix_len = print_prefix(msg, syslog, time, prefix); do { const char *next = memchr(text, '\n', text_size); @@ -1271,19 +1274,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *bu } if (buf) { - if (print_prefix(msg, syslog, NULL) + - text_len + 1 >= size - len) + if (prefix_len + text_len + 1 >= size - len) break; - len += print_prefix(msg, syslog, buf + len); + memcpy(buf + len, prefix, prefix_len); + len += prefix_len; memcpy(buf + len, text, text_len); len += text_len; buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len; - len++; + len += prefix_len + text_len + 1; } text = next; @@ -1318,9 +1319,17 @@ static int syslog_print(char __user *buf, int size) break; } + /* + * To keep reading/counting partial line consistent, + * use printk_time value as of the beginning of a line. + */ + if (!syslog_partial) + syslog_time = printk_time; + skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, syslog_time, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -1360,11 +1369,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + time = printk_time; logbuf_lock_irq(); /* * Find first record that fits, including all following records, @@ -1375,7 +1386,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -1386,7 +1397,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -1397,14 +1408,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); - int textlen; + int textlen = msg_print_text(msg, true, time, text, + LOG_LINE_MAX + PREFIX_MAX); - textlen = msg_print_text(msg, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } idx = log_next(idx); seq++; @@ -1528,11 +1534,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; + bool time = syslog_partial ? syslog_time : printk_time; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, true, time, NULL, + 0); + time = printk_time; idx = log_next(idx); seq++; } @@ -1935,21 +1944,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) } EXPORT_SYMBOL(vprintk); -asmlinkage int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...) -{ - va_list args; - int r; - - va_start(args, fmt); - r = vprintk_emit(facility, level, dict, dictlen, fmt, args); - va_end(args); - - return r; -} -EXPORT_SYMBOL(printk_emit); - int vprintk_default(const char *fmt, va_list args) { int r; @@ -2005,6 +1999,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 +#define printk_time false static u64 syslog_seq; static u32 syslog_idx; @@ -2028,8 +2023,8 @@ static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, - bool syslog, char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2387,8 +2382,7 @@ skip: len += msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, - text + len, - sizeof(text) - len); + printk_time, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), @@ -3112,7 +3106,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, line, size); + l = msg_print_text(msg, syslog, printk_time, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3183,6 +3177,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 next_idx; size_t l = 0; bool ret = false; + bool time = printk_time; if (!dumper->active) goto out; @@ -3206,7 +3201,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, true, NULL, 0); + l += msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -3217,7 +3212,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, true, NULL, 0); + l -= msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -3230,7 +3225,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, time, buf + l, size - l); idx = log_next(idx); seq++; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2866166863f0..a393e24a9195 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -526,12 +526,14 @@ srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } +static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); +void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; @@ -539,8 +541,10 @@ extern struct workqueue_struct *rcu_par_gp_wq; #ifdef CONFIG_RCU_NOCB_CPU bool rcu_is_nocb_cpu(int cpu); +void rcu_bind_current_to_nocb(void); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +static inline void rcu_bind_current_to_nocb(void) { } #endif #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 210c77460365..f6e85faa4ff4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -56,6 +56,7 @@ #include <linux/vmalloc.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> +#include <linux/oom.h> #include "rcu.h" @@ -80,13 +81,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@jos /* Must be power of two minus one. */ #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) -torture_param(int, cbflood_inter_holdoff, HZ, - "Holdoff between floods (jiffies)"); -torture_param(int, cbflood_intra_holdoff, 1, - "Holdoff between bursts (jiffies)"); -torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); -torture_param(int, cbflood_n_per_burst, 20000, - "# callbacks per burst in flood"); torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); torture_param(int, fqs_duration, 0, @@ -138,12 +132,10 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealreaders; -static int ncbflooders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; @@ -181,7 +173,6 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ -static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; static int rcu_torture_writer_state; @@ -259,6 +250,8 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); +static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ + /* * Allocate an element from the rcu_tortures pool. */ @@ -348,7 +341,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + if (!rcu_fwd_cb_nodelay && + !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) @@ -870,59 +864,6 @@ checkwait: stutter_wait("rcu_torture_boost"); return 0; } -static void rcu_torture_cbflood_cb(struct rcu_head *rhp) -{ -} - -/* - * RCU torture callback-flood kthread. Repeatedly induces bursts of calls - * to call_rcu() or analogous, increasing the probability of occurrence - * of callback-overflow corner cases. - */ -static int -rcu_torture_cbflood(void *arg) -{ - int err = 1; - int i; - int j; - struct rcu_head *rhp; - - if (cbflood_n_per_burst > 0 && - cbflood_inter_holdoff > 0 && - cbflood_intra_holdoff > 0 && - cur_ops->call && - cur_ops->cb_barrier) { - rhp = vmalloc(array3_size(cbflood_n_burst, - cbflood_n_per_burst, - sizeof(*rhp))); - err = !rhp; - } - if (err) { - VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - goto wait_for_stop; - } - VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); - do { - schedule_timeout_interruptible(cbflood_inter_holdoff); - atomic_long_inc(&n_cbfloods); - WARN_ON(signal_pending(current)); - for (i = 0; i < cbflood_n_burst; i++) { - for (j = 0; j < cbflood_n_per_burst; j++) { - cur_ops->call(&rhp[i * cbflood_n_per_burst + j], - rcu_torture_cbflood_cb); - } - schedule_timeout_interruptible(cbflood_intra_holdoff); - WARN_ON(signal_pending(current)); - } - cur_ops->cb_barrier(); - stutter_wait("rcu_torture_cbflood"); - } while (!torture_must_stop()); - vfree(rhp); -wait_for_stop: - torture_kthread_stopping("rcu_torture_cbflood"); - return 0; -} - /* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1457,11 +1398,10 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld ", + pr_cont("barrier: %ld/%ld:%ld\n", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || @@ -1674,8 +1614,90 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb); } -/* Carry out grace-period forward-progress testing. */ -static int rcu_torture_fwd_prog(void *args) +/* State for continuous-flood RCU callbacks. */ +struct rcu_fwd_cb { + struct rcu_head rh; + struct rcu_fwd_cb *rfc_next; + int rfc_gps; +}; +static DEFINE_SPINLOCK(rcu_fwd_lock); +static struct rcu_fwd_cb *rcu_fwd_cb_head; +static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; +static long n_launders_cb; +static unsigned long rcu_fwd_startat; +static bool rcu_fwd_emergency_stop; +#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ +#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ +#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ +#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; + +static void rcu_torture_fwd_cb_hist(void) +{ + int i; + int j; + + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", + __func__, jiffies - rcu_fwd_startat); + for (j = 0; j <= i; j++) + pr_cont(" %ds/%d: %ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + pr_cont("\n"); +} + +/* Callback function for continuous-flood RCU callbacks. */ +static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) +{ + unsigned long flags; + int i; + struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); + struct rcu_fwd_cb **rfcpp; + + rfcp->rfc_next = NULL; + rfcp->rfc_gps++; + spin_lock_irqsave(&rcu_fwd_lock, flags); + rfcpp = rcu_fwd_cb_tail; + rcu_fwd_cb_tail = &rfcp->rfc_next; + WRITE_ONCE(*rfcpp, rfcp); + WRITE_ONCE(n_launders_cb, n_launders_cb + 1); + i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(n_launders_hist)) + i = ARRAY_SIZE(n_launders_hist) - 1; + n_launders_hist[i]++; + spin_unlock_irqrestore(&rcu_fwd_lock, flags); +} + +/* + * Free all callbacks on the rcu_fwd_cb_head list, either because the + * test is over or because we hit an OOM event. + */ +static unsigned long rcu_torture_fwd_prog_cbfree(void) +{ + unsigned long flags; + unsigned long freed = 0; + struct rcu_fwd_cb *rfcp; + + for (;;) { + spin_lock_irqsave(&rcu_fwd_lock, flags); + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwd_cb_head) + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + kfree(rfcp); + freed++; + } + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + return freed; +} + +/* Carry out need_resched()/cond_resched() forward-progress testing. */ +static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; @@ -1686,56 +1708,186 @@ static int rcu_torture_fwd_prog(void *args) int sd4; bool selfpropcb = false; unsigned long stopat; - int tested = 0; - int tested_tries = 0; static DEFINE_TORTURE_RANDOM(trs); - VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); - if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) - set_user_nice(current, MAX_NICE); if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { init_rcu_head_on_stack(&fcs.rh); selfpropcb = true; } + + /* Tight loop containing cond_resched(). */ + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 0); + cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); + } + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + sd = cur_ops->stall_dur() + 1; + sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; + dur = sd4 + torture_random(&trs) % (sd - sd4); + WRITE_ONCE(rcu_fwd_startat, jiffies); + stopat = rcu_fwd_startat + dur; + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { + idx = cur_ops->readlock(); + udelay(10); + cur_ops->readunlock(idx); + if (!fwd_progress_need_resched || need_resched()) + cond_resched(); + } + (*tested_tries)++; + if (!time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { + (*tested)++; + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + WARN_ON(!cver && gps < 2); + pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + } + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 1); + cur_ops->sync(); /* Wait for running CB to complete. */ + cur_ops->cb_barrier(); /* Wait for queued callbacks. */ + } + + if (selfpropcb) { + WARN_ON(READ_ONCE(fcs.stop) != 2); + destroy_rcu_head_on_stack(&fcs.rh); + } +} + +/* Carry out call_rcu() forward-progress testing. */ +static void rcu_torture_fwd_prog_cr(void) +{ + unsigned long cver; + unsigned long gps; + int i; + long n_launders; + long n_launders_cb_snap; + long n_launders_sa; + long n_max_cbs; + long n_max_gps; + struct rcu_fwd_cb *rfcp; + struct rcu_fwd_cb *rfcpn; + unsigned long stopat; + unsigned long stoppedat; + + if (READ_ONCE(rcu_fwd_emergency_stop)) + return; /* Get out of the way quickly, no GP wait! */ + + /* Loop continuously posting RCU callbacks. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ + WRITE_ONCE(rcu_fwd_startat, jiffies); + stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + n_launders = 0; + n_launders_cb = 0; + n_launders_sa = 0; + n_max_cbs = 0; + n_max_gps = 0; + for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) + n_launders_hist[i] = 0; + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { + rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcpn = NULL; + if (rfcp) + rfcpn = READ_ONCE(rfcp->rfc_next); + if (rfcpn) { + if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && + ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) + break; + rcu_fwd_cb_head = rfcpn; + n_launders++; + n_launders_sa++; + } else { + rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); + if (WARN_ON_ONCE(!rfcp)) { + schedule_timeout_interruptible(1); + continue; + } + n_max_cbs++; + n_launders_sa = 0; + rfcp->rfc_gps = 0; + } + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + cond_resched(); + } + stoppedat = jiffies; + n_launders_cb_snap = READ_ONCE(n_launders_cb); + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ + (void)rcu_torture_fwd_prog_cbfree(); + + WRITE_ONCE(rcu_fwd_cb_nodelay, false); + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { + WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + __func__, + stoppedat - rcu_fwd_startat, jiffies - stoppedat, + n_launders + n_max_cbs - n_launders_cb_snap, + n_launders, n_launders_sa, + n_max_gps, n_max_cbs, cver, gps); + rcu_torture_fwd_cb_hist(); + } +} + + +/* + * OOM notifier, but this only prints diagnostic information for the + * current forward-progress test. + */ +static int rcutorture_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", + __func__); + rcu_torture_fwd_cb_hist(); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + WRITE_ONCE(rcu_fwd_emergency_stop, true); + smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + smp_mb(); /* Frees before return to avoid redoing OOM. */ + (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ + pr_info("%s returning after OOM processing.\n", __func__); + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_oom_nb = { + .notifier_call = rcutorture_oom_notify +}; + +/* Carry out grace-period forward-progress testing. */ +static int rcu_torture_fwd_prog(void *args) +{ + int tested = 0; + int tested_tries = 0; + + VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + rcu_bind_current_to_nocb(); + if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) + set_user_nice(current, MAX_NICE); do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 0); - cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); - } - cver = READ_ONCE(rcu_torture_current_version); - gps = cur_ops->get_gp_seq(); - sd = cur_ops->stall_dur() + 1; - sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; - dur = sd4 + torture_random(&trs) % (sd - sd4); - stopat = jiffies + dur; - while (time_before(jiffies, stopat) && !torture_must_stop()) { - idx = cur_ops->readlock(); - udelay(10); - cur_ops->readunlock(idx); - if (!fwd_progress_need_resched || need_resched()) - cond_resched(); - } - tested_tries++; - if (!time_before(jiffies, stopat) && !torture_must_stop()) { - tested++; - cver = READ_ONCE(rcu_torture_current_version) - cver; - gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); - WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); - } - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 1); - cur_ops->sync(); /* Wait for running CB to complete. */ - cur_ops->cb_barrier(); /* Wait for queued callbacks. */ - } + WRITE_ONCE(rcu_fwd_emergency_stop, false); + register_oom_notifier(&rcutorture_oom_nb); + rcu_torture_fwd_prog_nr(&tested, &tested_tries); + rcu_torture_fwd_prog_cr(); + unregister_oom_notifier(&rcutorture_oom_nb); + /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); } while (!torture_must_stop()); - if (selfpropcb) { - WARN_ON(READ_ONCE(fcs.stop) != 2); - destroy_rcu_head_on_stack(&fcs.rh); - } /* Short runs might not contain a valid forward-progress attempt. */ WARN_ON(!tested && tested_tries >= 5); pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); @@ -1748,7 +1900,8 @@ static int __init rcu_torture_fwd_prog_init(void) { if (!fwd_progress) return 0; /* Not requested, so don't do it. */ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) { + if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); return 0; } @@ -1968,8 +2121,6 @@ rcu_torture_cleanup(void) cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - for (i = 0; i < ncbflooders; i++) - torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); @@ -2252,24 +2403,6 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); - if (cbflood_n_burst > 0) { - /* Create the cbflood threads */ - ncbflooders = (num_online_cpus() + 3) / 4; - cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), - GFP_KERNEL); - if (!cbflood_task) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < ncbflooders; i++) { - firsterr = torture_create_kthread(rcu_torture_cbflood, - NULL, - cbflood_task[i]); - if (firsterr) - goto unwind; - } - } torture_init_end(); return 0; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b46e6683f8c9..32dfd6522548 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -37,30 +37,30 @@ int rcu_scheduler_active __read_mostly; static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; -static int init_srcu_struct_fields(struct srcu_struct *sp) +static int init_srcu_struct_fields(struct srcu_struct *ssp) { - sp->srcu_lock_nesting[0] = 0; - sp->srcu_lock_nesting[1] = 0; - init_swait_queue_head(&sp->srcu_wq); - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; - sp->srcu_gp_running = false; - sp->srcu_gp_waiting = false; - sp->srcu_idx = 0; - INIT_WORK(&sp->srcu_work, srcu_drive_gp); - INIT_LIST_HEAD(&sp->srcu_work.entry); + ssp->srcu_lock_nesting[0] = 0; + ssp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&ssp->srcu_wq); + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; + ssp->srcu_gp_running = false; + ssp->srcu_gp_waiting = false; + ssp->srcu_idx = 0; + INIT_WORK(&ssp->srcu_work, srcu_drive_gp); + INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -68,15 +68,15 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /* * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - return init_srcu_struct_fields(sp); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -84,22 +84,22 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); /* * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { - WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); if (quiesced) - WARN_ON(work_pending(&sp->srcu_work)); + WARN_ON(work_pending(&ssp->srcu_work)); else - flush_work(&sp->srcu_work); - WARN_ON(sp->srcu_gp_running); - WARN_ON(sp->srcu_gp_waiting); - WARN_ON(sp->srcu_cb_head); - WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); + flush_work(&ssp->srcu_work); + WARN_ON(ssp->srcu_gp_running); + WARN_ON(ssp->srcu_gp_waiting); + WARN_ON(ssp->srcu_cb_head); + WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -107,13 +107,13 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * Removes the count for the old reader from the appropriate element of * the srcu_struct. */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = sp->srcu_lock_nesting[idx] - 1; + int newval = ssp->srcu_lock_nesting[idx] - 1; - WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up_one(&sp->srcu_wq); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -127,24 +127,24 @@ void srcu_drive_gp(struct work_struct *wp) int idx; struct rcu_head *lh; struct rcu_head *rhp; - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) + ssp = container_of(wp, struct srcu_struct, srcu_work); + if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ - WRITE_ONCE(sp->srcu_gp_running, true); + WRITE_ONCE(ssp->srcu_gp_running, true); local_irq_disable(); - lh = sp->srcu_cb_head; - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; + lh = ssp->srcu_cb_head; + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = sp->srcu_idx; - WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); - WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); - WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + idx = ssp->srcu_idx; + WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ while (lh) { @@ -161,9 +161,9 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - WRITE_ONCE(sp->srcu_gp_running, false); - if (READ_ONCE(sp->srcu_cb_head)) - schedule_work(&sp->srcu_work); + WRITE_ONCE(ssp->srcu_gp_running, false); + if (READ_ONCE(ssp->srcu_cb_head)) + schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; @@ -179,14 +179,14 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; local_irq_save(flags); - *sp->srcu_cb_tail = rhp; - sp->srcu_cb_tail = &rhp->next; + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(sp->srcu_gp_running)) { + if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&sp->srcu_work); - else if (list_empty(&sp->srcu_work.entry)) - list_add(&sp->srcu_work.entry, &srcu_boot_list); + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); } } EXPORT_SYMBOL_GPL(call_srcu); @@ -194,13 +194,13 @@ EXPORT_SYMBOL_GPL(call_srcu); /* * synchronize_srcu - wait for prior SRCU read-side critical-section completion */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); - call_srcu(sp, &rs.head, wakeme_after_rcu); + call_srcu(ssp, &rs.head, wakeme_after_rcu); wait_for_completion(&rs.completion); destroy_rcu_head_on_stack(&rs.head); } @@ -219,13 +219,13 @@ void __init rcu_scheduler_starting(void) */ void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, srcu_work.entry); - list_del_init(&sp->srcu_work.entry); - schedule_work(&sp->srcu_work); + list_del_init(&ssp->srcu_work.entry); + schedule_work(&ssp->srcu_work); } } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a8846ed7f352..3600d88d8956 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -56,7 +56,7 @@ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ @@ -92,7 +92,7 @@ do { \ * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) { int cpu; int i; @@ -103,13 +103,13 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) struct srcu_node *snp_first; /* Work out the overall tree geometry. */ - sp->level[0] = &sp->node[0]; + ssp->level[0] = &ssp->node[0]; for (i = 1; i < rcu_num_lvls; i++) - sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); @@ -120,17 +120,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp->srcu_gp_seq_needed_exp = 0; snp->grplo = -1; snp->grphi = -1; - if (snp == &sp->node[0]) { + if (snp == &ssp->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == sp->level[level + 1]) + if (snp == ssp->level[level + 1]) level++; - snp->srcu_parent = sp->level[level - 1] + - (snp - sp->level[level]) / + snp->srcu_parent = ssp->level[level - 1] + + (snp - ssp->level[level]) / levelspread[level - 1]; } @@ -141,14 +141,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; - snp_first = sp->level[level]; + snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) @@ -157,7 +157,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) } sdp->cpu = cpu; INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); - sdp->sp = sp; + sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) continue; @@ -176,35 +176,35 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) * parameter is passed through to init_srcu_struct_nodes(), and * also tells us that ->sda has already been wired up to srcu_data. */ -static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - mutex_init(&sp->srcu_cb_mutex); - mutex_init(&sp->srcu_gp_mutex); - sp->srcu_idx = 0; - sp->srcu_gp_seq = 0; - sp->srcu_barrier_seq = 0; - mutex_init(&sp->srcu_barrier_mutex); - atomic_set(&sp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&sp->work, process_srcu); + mutex_init(&ssp->srcu_cb_mutex); + mutex_init(&ssp->srcu_gp_mutex); + ssp->srcu_idx = 0; + ssp->srcu_gp_seq = 0; + ssp->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_barrier_mutex); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) - sp->sda = alloc_percpu(struct srcu_data); - init_srcu_struct_nodes(sp, is_static); - sp->srcu_gp_seq_needed_exp = 0; - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ - return sp->sda ? 0 : -ENOMEM; + ssp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(ssp, is_static); + ssp->srcu_gp_seq_needed_exp = 0; + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + return ssp->sda ? 0 : -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -212,16 +212,16 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /** * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -231,37 +231,37 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use sp->lock, which -is- + * to each update-side SRCU primitive. Use ssp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ -static void check_init_srcu_struct(struct srcu_struct *sp) +static void check_init_srcu_struct(struct srcu_struct *ssp) { unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp, flags); return; } - init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore_rcu_node(sp, flags); + init_srcu_struct_fields(ssp, true); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* * Returns approximate total of the readers' ->srcu_lock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[idx]); } @@ -272,13 +272,13 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); } @@ -289,11 +289,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) * Return true if the number of pre-existing readers is determined to * be zero. */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx); /* * Make sure that a lock is always counted if the corresponding @@ -329,25 +329,25 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, * especially on 64-bit systems. */ - return srcu_readers_lock_idx(sp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx) == unlocks; } /** * srcu_readers_active - returns true if there are readers. and false * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * @ssp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static bool srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *ssp) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[0]); sum += READ_ONCE(cpuc->srcu_lock_count[1]); @@ -363,44 +363,44 @@ static bool srcu_readers_active(struct srcu_struct *sp) * Return grace-period delay, zero if there are expedited grace * periods pending, SRCU_INTERVAL otherwise. */ -static unsigned long srcu_get_delay(struct srcu_struct *sp) +static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), - READ_ONCE(sp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), + READ_ONCE(ssp->srcu_gp_seq_needed_exp))) return 0; return SRCU_INTERVAL; } /* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { int cpu; - if (WARN_ON(!srcu_get_delay(sp))) + if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ - if (WARN_ON(srcu_readers_active(sp))) + if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ if (quiesced) { - if (WARN_ON(delayed_work_pending(&sp->work))) + if (WARN_ON(delayed_work_pending(&ssp->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&sp->work); + flush_delayed_work(&ssp->work); } for_each_possible_cpu(cpu) if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); } - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(srcu_readers_active(sp))) { + if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(sp->sda); - sp->sda = NULL; + free_percpu(ssp->sda); + ssp->sda = NULL; } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -409,12 +409,12 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ -int __srcu_read_lock(struct srcu_struct *sp) +int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(sp->srcu_idx) & 0x1; - this_cpu_inc(sp->sda->srcu_lock_count[idx]); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -425,10 +425,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->sda->srcu_unlock_count[idx]); + this_cpu_inc(ssp->sda->srcu_unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -444,20 +444,22 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Start an SRCU grace period. */ -static void srcu_gp_start(struct srcu_struct *sp) +static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(sp->sda); + struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; - lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); + spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&sp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + rcu_seq_start(&ssp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -511,7 +513,7 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) * just-completed grace period, the one corresponding to idx. If possible, * schedule this invocation on the corresponding CPUs. */ -static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long mask, unsigned long delay) { int cpu; @@ -519,7 +521,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { if (!(mask & (1 << (cpu - snp->grplo)))) continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } } @@ -532,7 +534,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, * are initiating callback invocation. This allows the ->srcu_have_cbs[] * array to have a finite number of elements. */ -static void srcu_gp_end(struct srcu_struct *sp) +static void srcu_gp_end(struct srcu_struct *ssp) { unsigned long cbdelay; bool cbs; @@ -546,28 +548,28 @@ static void srcu_gp_end(struct srcu_struct *sp) struct srcu_node *snp; /* Prevent more than one additional grace period. */ - mutex_lock(&sp->srcu_cb_mutex); + mutex_lock(&ssp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sp); - idx = rcu_seq_state(sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(sp); - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - rcu_seq_end(&sp->srcu_gp_seq); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) - sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + cbdelay = srcu_get_delay(ssp); + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&ssp->srcu_gp_seq); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) + ssp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -578,12 +580,12 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); if (cbs) - srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) @@ -596,18 +598,18 @@ static void srcu_gp_end(struct srcu_struct *sp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&sp->srcu_cb_mutex); + mutex_unlock(&ssp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sp); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); - srcu_reschedule(sp, 0); + ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); + srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); } } @@ -618,13 +620,13 @@ static void srcu_gp_end(struct srcu_struct *sp) * but without expediting. To start a completely new grace period, * whether expedited or not, use srcu_funnel_gp_start() instead. */ -static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long s) { unsigned long flags; for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) || + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -635,10 +637,10 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -651,7 +653,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * Note that this function also does the work of srcu_funnel_exp_start(), * in some cases by directly invoking it. */ -static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, +static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long s, bool do_norm) { unsigned long flags; @@ -661,7 +663,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, /* Each pass through the loop does one level of the srcu_node tree. */ for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { @@ -676,7 +678,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, return; } if (!do_norm) - srcu_funnel_exp_start(sp, snp, s); + srcu_funnel_exp_start(ssp, snp, s); return; } snp->srcu_have_cbs[idx] = s; @@ -688,29 +690,29 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; + if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&sp->srcu_gp_seq, s) && - rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); - srcu_gp_start(sp); + if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && + rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + srcu_gp_start(ssp); if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sp->work, - srcu_get_delay(sp)); - else if (list_empty(&sp->work.work.entry)) - list_add(&sp->work.work.entry, &srcu_boot_list); + queue_delayed_work(rcu_gp_wq, &ssp->work, + srcu_get_delay(ssp)); + else if (list_empty(&ssp->work.work.entry)) + list_add(&ssp->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -718,12 +720,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, * loop an additional time if there is an expedited grace period pending. * The caller must ensure that ->srcu_idx is not changed while checking. */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) + if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(sp) <= 0) + if (--trycount + !srcu_get_delay(ssp) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -734,7 +736,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ -static void srcu_flip(struct srcu_struct *sp) +static void srcu_flip(struct srcu_struct *ssp) { /* * Ensure that if this updater saw a given reader's increment @@ -746,7 +748,7 @@ static void srcu_flip(struct srcu_struct *sp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -779,7 +781,7 @@ static void srcu_flip(struct srcu_struct *sp) * negligible when amoritized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *sp) +static bool srcu_might_be_idle(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -788,7 +790,7 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { local_irq_restore(flags); return false; /* Callbacks already present, so not idle. */ @@ -804,17 +806,17 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); if (exp_holdoff == 0 || - time_in_range_open(t, sp->srcu_last_gp_end, - sp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, ssp->srcu_last_gp_end, + ssp->srcu_last_gp_end + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&sp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -854,16 +856,17 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { unsigned long flags; + int idx; bool needexp = false; bool needgp = false; unsigned long s; struct srcu_data *sdp; - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -871,13 +874,14 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, return; } rhp->func = func; + idx = srcu_read_lock(ssp); local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - s = rcu_seq_snap(&sp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -889,14 +893,15 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, } spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) - srcu_funnel_gp_start(sp, sdp, s, do_norm); + srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(sp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); } /** * call_srcu() - Queue a callback for invocation after an SRCU grace period - * @sp: srcu_struct in queue the callback + * @ssp: srcu_struct in queue the callback * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * @@ -911,21 +916,21 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, * The callback will be invoked from process context, but must nevertheless * be fast and must not block. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { - __call_srcu(sp, rhp, func, true); + __call_srcu(ssp, rhp, func, true); } EXPORT_SYMBOL_GPL(call_srcu); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) +static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -934,10 +939,10 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; might_sleep(); - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); init_completion(&rcu.completion); init_rcu_head_on_stack(&rcu.head); - __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); + __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); @@ -953,7 +958,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /** * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. @@ -961,15 +966,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) * Note that synchronize_srcu_expedited() has the same deadlock and * memory-ordering properties as does synchronize_srcu(). */ -void synchronize_srcu_expedited(struct srcu_struct *sp) +void synchronize_srcu_expedited(struct srcu_struct *ssp) { - __synchronize_srcu(sp, rcu_gp_is_normal()); + __synchronize_srcu(ssp, rcu_gp_is_normal()); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of @@ -1011,12 +1016,12 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * SRCU must also provide it. Note that detecting idleness is heuristic * and subject to both false positives and negatives. */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) - synchronize_srcu_expedited(sp); + if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(ssp); else - __synchronize_srcu(sp, true); + __synchronize_srcu(ssp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -1026,36 +1031,36 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); static void srcu_barrier_cb(struct rcu_head *rhp) { struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); - sp = sdp->sp; - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); + ssp = sdp->ssp; + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); } /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. + * @ssp: srcu_struct on which to wait for in-flight callbacks. */ -void srcu_barrier(struct srcu_struct *sp) +void srcu_barrier(struct srcu_struct *ssp) { int cpu; struct srcu_data *sdp; - unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); - check_init_srcu_struct(sp); - mutex_lock(&sp->srcu_barrier_mutex); - if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + check_init_srcu_struct(ssp); + mutex_lock(&ssp->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&sp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&sp->srcu_barrier_seq); - init_completion(&sp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_barrier_seq); + init_completion(&ssp->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); /* * Each pass through this loop enqueues a callback, but only @@ -1066,39 +1071,39 @@ void srcu_barrier(struct srcu_struct *sp) * grace period as the last callback already in the queue. */ for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irq_rcu_node(sdp); - atomic_inc(&sp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head, 0)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&sp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); - wait_for_completion(&sp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_barrier_completion); - rcu_seq_end(&sp->srcu_barrier_seq); - mutex_unlock(&sp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); /** * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. + * @ssp: srcu_struct on which to report batch completion. * * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return sp->srcu_idx; + return ssp->srcu_idx; } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1107,11 +1112,11 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has * completed in that state. */ -static void srcu_advance_state(struct srcu_struct *sp) +static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&sp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1123,47 +1128,47 @@ static void srcu_advance_state(struct srcu_struct *sp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&sp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 1)) { - mutex_unlock(&sp->srcu_gp_mutex); + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 1)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_flip(sp); - rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + srcu_flip(ssp); + rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 2)) { - mutex_unlock(&sp->srcu_gp_mutex); + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 2)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1179,14 +1184,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct rcu_cblist ready_cbs; struct rcu_head *rhp; struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(work, struct srcu_data, work.work); - sp = sdp->sp; + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1212,7 +1217,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1224,24 +1229,24 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Finished one round of SRCU grace period. Start another if there are * more SRCU callbacks queued, otherwise put SRCU into not-running state. */ -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ - srcu_gp_start(sp); + srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); if (pushgp) - queue_delayed_work(rcu_gp_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->work, delay); } /* @@ -1249,41 +1254,41 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(work, struct srcu_struct, work.work); + ssp = container_of(work, struct srcu_struct, work.work); - srcu_advance_state(sp); - srcu_reschedule(sp, srcu_get_delay(sp)); + srcu_advance_state(ssp); + srcu_reschedule(ssp, srcu_get_delay(ssp)); } void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, + struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); -void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; - idx = sp->srcu_idx & 0x1; + idx = ssp->srcu_idx & 0x1; pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; struct srcu_data *sdp; - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); u0 = sdp->srcu_unlock_count[!idx]; u1 = sdp->srcu_unlock_count[idx]; @@ -1318,14 +1323,14 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, struct srcu_struct, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); - check_init_srcu_struct(sp); - list_del_init(&sp->work.work.entry); - queue_work(rcu_gp_wq, &sp->work.work); + check_init_srcu_struct(ssp); + list_del_init(&ssp->work.work.entry); + queue_work(rcu_gp_wq, &ssp->work.work); } } diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 3f943efcf61c..be10036fa621 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -44,15 +44,15 @@ static const struct { __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { - .sync = synchronize_sched, - .call = call_rcu_sched, - .wait = rcu_barrier_sched, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { - .sync = synchronize_rcu_bh, - .call = call_rcu_bh, - .wait = rcu_barrier_bh, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_bh_held) }, }; @@ -125,8 +125,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) rsp->gp_state = GP_PENDING; spin_unlock_irq(&rsp->rss_lock); - BUG_ON(need_wait && need_sync); - + WARN_ON_ONCE(need_wait && need_sync); if (need_sync) { gp_ops[rsp->gp_type].sync(); rsp->gp_state = GP_PASSED; @@ -139,7 +138,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * Nobody has yet been allowed the 'fast' path and thus we can * avoid doing any sync(). The callback will get 'dropped'. */ - BUG_ON(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); } } @@ -166,8 +165,8 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - BUG_ON(rsp->gp_state != GP_PASSED); - BUG_ON(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->cb_state == CB_IDLE); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { @@ -225,7 +224,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int cb_state; - BUG_ON(rsp->gp_count); + WARN_ON_ONCE(rsp->gp_count); spin_lock_irq(&rsp->rss_lock); if (rsp->cb_state == CB_REPLAY) @@ -235,6 +234,6 @@ void rcu_sync_dtor(struct rcu_sync *rsp) if (cb_state != CB_IDLE) { gp_ops[rsp->gp_type].wait(); - BUG_ON(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121f833acd04..9180158756d2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -207,6 +207,19 @@ static int rcu_gp_in_progress(void) return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); } +/* + * Return the number of callbacks queued on the specified CPU. + * Handles both the nocbs and normal cases. + */ +static long rcu_get_n_cbs_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + return rcu_segcblist_n_cbs(&rdp->cblist); + return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ +} + void rcu_softirq_qs(void) { rcu_qs(); @@ -500,16 +513,29 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Show the state of the grace-period kthreads. */ void show_rcu_gp_kthreads(void) { int cpu; + unsigned long j; struct rcu_data *rdp; struct rcu_node *rnp; - pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name, - rcu_state.gp_state, rcu_state.gp_kthread->state); + j = jiffies - READ_ONCE(rcu_state.gp_activity); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, rcu_state.gp_kthread->state, j); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; @@ -891,12 +917,12 @@ void rcu_irq_enter_irqson(void) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is idle + * rcu_is_watching - see if RCU thinks that the current CPU is not idle * * Return true if RCU is watching the running CPU, which means that this * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. + * if the current CPU is not in its idle loop or is in an interrupt or + * NMI handler, return true. */ bool notrace rcu_is_watching(void) { @@ -1143,16 +1169,6 @@ static void record_gp_stall_check_time(void) } /* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - -/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) @@ -1262,8 +1278,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rcu_state.gp_start), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); @@ -1323,8 +1338,7 @@ static void print_cpu_stall(void) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rcu_state.gp_start, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); @@ -1986,7 +2000,8 @@ static void rcu_gp_cleanup(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - gp_duration = jiffies - rcu_state.gp_start; + rcu_state.gp_end = jiffies; + gp_duration = rcu_state.gp_end - rcu_state.gp_start; if (gp_duration > rcu_state.gp_max) rcu_state.gp_max = gp_duration; @@ -2032,9 +2047,9 @@ static void rcu_gp_cleanup(void) rnp = rcu_get_root(); raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ - /* Declare grace period done. */ - rcu_seq_end(&rcu_state.gp_seq); + /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); + rcu_seq_end(&rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -2600,10 +2615,10 @@ static void force_quiescent_state(void) * This function checks for grace-period requests that fail to motivate * RCU to come out of its idle mode. */ -static void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) +void +rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) { - const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(); @@ -2655,6 +2670,48 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) } /* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. + */ +void rcu_fwd_progress_check(unsigned long j) +{ + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); + show_rcu_gp_kthreads(); + } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* * This does the RCU core processing work for the specified rcu_data * structures. This may be called only from the CPU to whom the rdp * belongs. @@ -2690,7 +2747,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused local_irq_restore(flags); } - rcu_check_gp_start_stall(rnp, rdp); + rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) @@ -2826,7 +2883,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Very early boot, before rcu_init(). Initialize if needed * and then drop through to queue the callback. */ - BUG_ON(cpu != -1); + WARN_ON_ONCE(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); @@ -3485,7 +3542,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.gp_kthread = t; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 703e19ff532d..d90b02b53c0e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -57,7 +57,7 @@ struct rcu_node { /* some rcu_state fields as well as */ /* following. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -163,7 +163,7 @@ union rcu_noqs { struct rcu_data { /* 1) quiescent-state and grace-period handling : */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ @@ -328,6 +328,8 @@ struct rcu_state { /* force_quiescent_state(). */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_end; /* Time last GP ended, again */ + /* in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ unsigned long gp_req_activity; /* Time of last GP request */ @@ -398,17 +400,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -/* - * RCU implementation internal declarations: - */ -extern struct rcu_state rcu_sched_state; - -extern struct rcu_state rcu_bh_state; - -#ifdef CONFIG_PREEMPT_RCU -extern struct rcu_state rcu_preempt_state; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - int rcu_dynticks_snap(struct rcu_data *rdp); #ifdef CONFIG_RCU_BOOST @@ -466,6 +457,7 @@ static void __init rcu_spawn_nocb_kthreads(void); static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool init_nocb_callback_list(struct rcu_data *rdp); +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8d18c1014e2b..928fe5893a57 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -450,10 +450,12 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); preempt_disable(); - cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi)) + if (unlikely(cpu > rnp->grphi - rnp->grplo)) cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); preempt_enable(); rnp->exp_need_flush = true; @@ -690,8 +692,10 @@ static void sync_rcu_exp_handler(void *unused) */ if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmask & rdp->grpmask) + if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05915e536336..1b3dd2fc0cd6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -397,6 +397,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } +/* Bias and limit values for ->rcu_read_lock_nesting. */ +#define RCU_NEST_BIAS INT_MAX +#define RCU_NEST_NMAX (-INT_MAX / 2) +#define RCU_NEST_PMAX (INT_MAX / 2) + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -405,6 +410,8 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) void __rcu_read_lock(void) { current->rcu_read_lock_nesting++; + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -424,20 +431,18 @@ void __rcu_read_unlock(void) --t->rcu_read_lock_nesting; } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; + t->rcu_read_lock_nesting = -RCU_NEST_BIAS; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + int rrln = t->rcu_read_lock_nesting; - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -597,7 +602,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (this_cpu_ptr(&rcu_data)->deferred_qs || + return (__this_cpu_read(rcu_data.deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } @@ -617,11 +622,11 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= INT_MIN; + t->rcu_read_lock_nesting -= RCU_NEST_BIAS; local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += INT_MIN; + t->rcu_read_lock_nesting += RCU_NEST_BIAS; } /* @@ -642,13 +647,21 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); - if ((preempt_bh_were_disabled || irqs_were_disabled) && - t->rcu_read_unlock_special.b.blocked) { + if (preempt_bh_were_disabled || irqs_were_disabled) { + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - raise_softirq_irqoff(RCU_SOFTIRQ); + if (irqs_were_disabled) { + /* Enabling irqs does not reschedule, so... */ + raise_softirq_irqoff(RCU_SOFTIRQ); + } else { + /* Enabling BH or preempt does reschedule, so... */ + set_tsk_need_resched(current); + set_preempt_need_resched(); + } local_irq_restore(flags); return; } + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } @@ -1464,7 +1477,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); + if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) + return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1997,7 +2011,7 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ - ret = atomic_long_read(&rdp->nocb_q_count); + ret = rcu_get_n_cbs_nocb_cpu(rdp); #ifdef CONFIG_PROVE_RCU rhp = READ_ONCE(rdp->nocb_head); @@ -2052,7 +2066,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, TPS("WakeNotPoll")); return; } - len = atomic_long_read(&rdp->nocb_q_count); + len = rcu_get_n_cbs_nocb_cpu(rdp); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ @@ -2101,11 +2115,11 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_kfree_callback(rcu_state.name, rhp, (unsigned long)rhp->func, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); else trace_rcu_callback(rcu_state.name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); /* * If called from an extended quiescent state with interrupts @@ -2322,13 +2336,14 @@ static int rcu_nocb_kthread(void *arg) tail = rdp->nocb_follower_tail; rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - BUG_ON(!list); + if (WARN_ON_ONCE(!list)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rcu_state.name, atomic_long_read(&rdp->nocb_q_count_lazy), - atomic_long_read(&rdp->nocb_q_count), -1); + rcu_get_n_cbs_nocb_cpu(rdp), -1); c = cl = 0; while (list) { next = list->next; @@ -2495,7 +2510,8 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp_spawn, "rcuo%c/%d", rcu_state.abbr, cpu); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) + return; WRITE_ONCE(rdp_spawn->nocb_kthread, t); } @@ -2587,6 +2603,26 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return true; } +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. + */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + +/* + * Return the number of RCU callbacks still queued from the specified + * CPU, which must be a nocbs CPU. + */ +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return atomic_long_read(&rdp->nocb_q_count); +} + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_nocb_cpu_needs_barrier(int cpu) @@ -2647,6 +2683,11 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; } +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f203b94f6b5b..1971869c4072 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -335,8 +335,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Initialize and register callbacks for each crcu_array element. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) { + (crcu_array[i] == call_rcu)) { might_sleep(); continue; } @@ -352,8 +351,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Wait for all callbacks to be invoked. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) + (crcu_array[i] == call_rcu)) continue; for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) @@ -822,7 +820,8 @@ static int __init rcu_spawn_tasks_kthread(void) struct task_struct *t; t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6fedf3a98581..f66920173370 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -697,7 +697,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) /* * SCHED_IDLE tasks get minimal weight: */ - if (idle_policy(p->policy)) { + if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; p->se.runnable_weight = load->weight; @@ -2857,7 +2857,7 @@ unsigned long nr_running(void) * preemption, thus the result might have a time-of-check-to-time-of-use * race. The caller is responsible to use it correctly, for example: * - * - from a non-preemptable section (of course) + * - from a non-preemptible section (of course) * * - from a thread that is bound to a single CPU * @@ -4191,7 +4191,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (idle_policy(p->policy) && !idle_policy(policy)) { + if (task_has_idle_policy(p) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -5783,7 +5783,7 @@ int sched_cpu_deactivate(unsigned int cpu) * * Do sync before park smpboot threads to take care the rcu boost case. */ - synchronize_rcu_mult(call_rcu, call_rcu_sched); + synchronize_rcu(); #ifdef CONFIG_SCHED_SMT /* diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5e54cbcae673..22bd8980f32f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Scheduler code and data structures related to cpufreq. * * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include "sched.h" diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3fffad3bc8a8..033ec7c45f13 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -1,18 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 /* * CPUFreq governor based on scheduler-provided CPU utilization data. * * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "sched.h" +#include <linux/sched/cpufreq.h> #include <trace/events/power.h> struct sugov_tunables { @@ -167,7 +165,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - freq = (freq + (freq >> 2)) * util / max; + freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; @@ -197,15 +195,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, irq, max; - - sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); - if (rt_rq_is_runnable(&rq->rt)) + if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) return max; /* @@ -223,22 +219,31 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. */ - util = cpu_util_cfs(rq); + util = util_cfs; util += cpu_util_rt(rq); + dl_util = cpu_util_dl(rq); + /* - * We do not make cpu_util_dl() a permanent part of this sum because we - * want to use cpu_bw_dl() later on, but we need to check if the - * CFS+RT+DL sum is saturated (ie. no idle time) such that we select - * f_max when there is no idle time. + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. * * NOTE: numerical errors or stop class might cause us to not quite hit * saturation when we should -- something for later. */ - if ((util + cpu_util_dl(rq)) >= max) + if (util + dl_util >= max) return max; /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + + /* * There is still idle time; further improve the number by using the * irq metric. Because IRQ/steal time is hidden from the task clock we * need to scale the task numbers: @@ -260,7 +265,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * bw_dl as requested freq. However, cpufreq is not yet ready for such * an interface. So, we only do the latter for now. */ - return min(max, util + sg_cpu->bw_dl); + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} + +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +{ + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util = cpu_util_cfs(rq); + unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + + sg_cpu->max = max; + sg_cpu->bw_dl = cpu_bw_dl(rq); + + return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); } /** @@ -601,7 +621,7 @@ static struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -static struct cpufreq_governor schedutil_gov; +struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -860,7 +880,7 @@ static void sugov_limits(struct cpufreq_policy *policy) sg_policy->need_freq_update = true; } -static struct cpufreq_governor schedutil_gov = { +struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .dynamic_switching = true, @@ -883,3 +903,36 @@ static int __init sugov_register(void) return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); + +#ifdef CONFIG_ENERGY_MODEL +extern bool sched_energy_update; +extern struct mutex sched_energy_mutex; + +static void rebuild_sd_workfn(struct work_struct *work) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} +static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + +/* + * EAS shouldn't be attempted without sugov, so rebuild the sched_domains + * on governor changes to make sure the scheduler knows about it. + */ +void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov) +{ + if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); + } + +} +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0796f938c4f0..ba4a143bdcf3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -525,7 +525,7 @@ void account_idle_ticks(unsigned long ticks) /* * Perform (stime * rtime) / total, but avoid multiplication overflow by - * loosing precision when the numbers are big. + * losing precision when the numbers are big. */ static u64 scale_stime(u64 stime, u64 rtime, u64 total) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 91e4202b0634..fb8b7b5d745d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -727,7 +727,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * refill the runtime and set the deadline a period in the future, * because keeping the current (absolute) deadline of the task would * result in breaking guarantees promised to other tasks (refer to - * Documentation/scheduler/sched-deadline.txt for more informations). + * Documentation/scheduler/sched-deadline.txt for more information). * * This function returns true if: * @@ -1695,6 +1695,14 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif +static inline void set_next_task(struct rq *rq, struct task_struct *p) +{ + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { @@ -1750,10 +1758,8 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) BUG_ON(!dl_se); p = dl_task_of(dl_se); - p->se.exec_start = rq_clock_task(rq); - /* Running task will never be pushed. */ - dequeue_pushable_dl_task(rq, p); + set_next_task(rq, p); if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); @@ -1808,12 +1814,7 @@ static void task_fork_dl(struct task_struct *p) static void set_curr_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - - p->se.exec_start = rq_clock_task(rq); - - /* You can't push away the running task */ - dequeue_pushable_dl_task(rq, p); + set_next_task(rq, rq->curr); } #ifdef CONFIG_SMP @@ -2041,10 +2042,8 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); + if (WARN_ON(next_task == rq->curr)) return 0; - } /* * If next_task preempts rq->curr, and rq->curr diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6383aa6a60ca..02bd5f969b21 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -974,7 +974,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); - if (p->policy == SCHED_DEADLINE) { + if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac855b2f4774..d1907506318a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -38,7 +38,7 @@ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +static unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity @@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu) { return -cpu; } + +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + * + * (default: ~20%) + */ +static unsigned int capacity_margin = 1280; #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu) unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -/* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 - * - * (default: ~20%) - */ -unsigned int capacity_margin = 1280; - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -703,9 +703,9 @@ void init_entity_runnable_average(struct sched_entity *se) memset(sa, 0, sizeof(*sa)); /* - * Tasks are intialized with full load to be seen as heavy tasks until + * Tasks are initialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. - * Group entities are intialized with zero load to reflect the fact that + * Group entities are initialized with zero load to reflect the fact that * nothing has been attached to the task group yet. */ if (entity_is_task(se)) @@ -2734,6 +2734,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) WRITE_ONCE(*ptr, res); \ } while (0) +/* + * Remove and clamp on negative, from a local variable. + * + * A variant of sub_positive(), which does not use explicit load-store + * and is thus optimized for local variable updates. + */ +#define lsub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + *ptr -= min_t(typeof(*ptr), *ptr, _val); \ +} while (0) + #ifdef CONFIG_SMP static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -3604,7 +3615,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return max(ue.ewma, ue.enqueued); + return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); } static inline unsigned long task_util_est(struct task_struct *p) @@ -3622,7 +3633,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, /* Update root cfs_rq's estimated utilization */ enqueued = cfs_rq->avg.util_est.enqueued; - enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); } @@ -3650,8 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) /* Update root cfs_rq's estimated utilization */ ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* @@ -3966,8 +3976,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Substract its load from the cfs_rq->runnable_avg. - * - Substract its previous weight from cfs_rq->load.weight. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ @@ -4640,7 +4650,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - cfs_b->runtime -= min(runtime, cfs_b->runtime); + lsub_positive(&cfs_b->runtime, runtime); } /* @@ -4774,7 +4784,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -5072,6 +5082,24 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static inline unsigned long cpu_util(int cpu); +static unsigned long capacity_of(int cpu); + +static inline bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +static inline void update_overutilized_status(struct rq *rq) +{ + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); +} +#else +static inline void update_overutilized_status(struct rq *rq) { } +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5129,8 +5157,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); + + } hrtick_update(rq); } @@ -6241,7 +6287,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) util = READ_ONCE(cfs_rq->avg.util_avg); /* Discount task's util from CPU's util */ - util -= min_t(unsigned int, util, task_util(p)); + lsub_positive(&util, task_util(p)); /* * Covered cases: @@ -6290,10 +6336,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) * properly fix the execl regression and it helps in further * reducing the chances for the above race. */ - if (unlikely(task_on_rq_queued(p) || current == p)) { - estimated -= min_t(unsigned int, estimated, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&estimated, _task_util_est(p)); + util = max(util, estimated); } @@ -6333,6 +6378,213 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) } /* + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) + * to @dst_cpu. + */ +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + + /* + * If @p migrates from @cpu to another, remove its contribution. Or, + * if @p migrates from another CPU to @cpu, add its contribution. In + * the other cases, @cpu is not impacted by the migration, so the + * util_avg should already be correct. + */ + if (task_cpu(p) == cpu && dst_cpu != cpu) + sub_positive(&util, task_util(p)); + else if (task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * During wake-up, the task isn't enqueued yet and doesn't + * appear in the cfs_rq->avg.util_est.enqueued of any rq, + * so just add it (if needed) to "simulate" what will be + * cpu_util() after the task has been enqueued. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + + util = max(util, util_est); + } + + return min(util, capacity_orig_of(cpu)); +} + +/* + * compute_energy(): Estimates the energy that would be consumed if @p was + * migrated to @dst_cpu. compute_energy() predicts what will be the utilization + * landscape of the * CPUs after the task migration, and uses the Energy Model + * to compute what would be the energy if we decided to actually migrate that + * task. + */ +static long +compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +{ + long util, max_util, sum_util, energy = 0; + int cpu; + + for (; pd; pd = pd->next) { + max_util = sum_util = 0; + /* + * The capacity state of CPUs of the current rd can be driven by + * CPUs of another rd if they belong to the same performance + * domain. So, account for the utilization of these CPUs too + * by masking pd with cpu_online_mask instead of the rd span. + * + * If an entire performance domain is outside of the current rd, + * it will not appear in its pd list and will not be accounted + * by compute_energy(). + */ + for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { + util = cpu_util_next(cpu, p, dst_cpu); + util = schedutil_energy_util(cpu, util); + max_util = max(util, max_util); + sum_util += util; + } + + energy += em_pd_energy(pd->em_pd, max_util, sum_util); + } + + return energy; +} + +/* + * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the + * waking task. find_energy_efficient_cpu() looks for the CPU with maximum + * spare capacity in each performance domain and uses it as a potential + * candidate to execute the task. Then, it uses the Energy Model to figure + * out which of the CPU candidates is the most energy-efficient. + * + * The rationale for this heuristic is as follows. In a performance domain, + * all the most energy efficient CPU candidates (according to the Energy + * Model) are those for which we'll request a low frequency. When there are + * several CPUs for which the frequency request will be the same, we don't + * have enough data to break the tie between them, because the Energy Model + * only includes active power costs. With this model, if we assume that + * frequency requests follow utilization (e.g. using schedutil), the CPU with + * the maximum spare capacity in a performance domain is guaranteed to be among + * the best candidates of the performance domain. + * + * In practice, it could be preferable from an energy standpoint to pack + * small tasks on a CPU in order to let other CPUs go in deeper idle states, + * but that could also hurt our chances to go cluster idle, and we have no + * ways to tell with the current Energy Model if this is actually a good + * idea or not. So, find_energy_efficient_cpu() basically favors + * cluster-packing, and spreading inside a cluster. That should at least be + * a good thing for latency, and this is consistent with the idea that most + * of the energy savings of EAS come from the asymmetry of the system, and + * not so much from breaking the tie between identical CPUs. That's also the + * reason why EAS is enabled in the topology code only for systems where + * SD_ASYM_CPUCAPACITY is set. + * + * NOTE: Forkees are not accepted in the energy-aware wake-up path because + * they don't have any useful utilization data yet and it's not possible to + * forecast their impact on energy consumption. Consequently, they will be + * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * to be energy-inefficient in some use-cases. The alternative would be to + * bias new tasks towards specific types of CPUs first, or to try to infer + * their util_avg from the parent task, but those heuristics could hurt + * other use-cases too. So, until someone finds a better way to solve this, + * let's keep things simple by re-using the existing slow path. + */ + +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +{ + unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int cpu, best_energy_cpu = prev_cpu; + struct perf_domain *head, *pd; + unsigned long cpu_cap, util; + struct sched_domain *sd; + + rcu_read_lock(); + pd = rcu_dereference(rd->pd); + if (!pd || READ_ONCE(rd->overutilized)) + goto fail; + head = pd; + + /* + * Energy-aware wake-up happens on the lowest sched_domain starting + * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. + */ + sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + sd = sd->parent; + if (!sd) + goto fail; + + sync_entity_load_avg(&p->se); + if (!task_util_est(p)) + goto unlock; + + for (; pd; pd = pd->next) { + unsigned long cur_energy, spare_cap, max_spare_cap = 0; + int max_spare_cap_cpu = -1; + + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + continue; + + /* Skip CPUs that will be overutilized. */ + util = cpu_util_next(cpu, p, cpu); + cpu_cap = capacity_of(cpu); + if (cpu_cap * 1024 < util * capacity_margin) + continue; + + /* Always use prev_cpu as a candidate. */ + if (cpu == prev_cpu) { + prev_energy = compute_energy(p, prev_cpu, head); + best_energy = min(best_energy, prev_energy); + continue; + } + + /* + * Find the CPU with the maximum spare capacity in + * the performance domain + */ + spare_cap = cpu_cap - util; + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = cpu; + } + } + + /* Evaluate the energy impact of using this CPU. */ + if (max_spare_cap_cpu >= 0) { + cur_energy = compute_energy(p, max_spare_cap_cpu, head); + if (cur_energy < best_energy) { + best_energy = cur_energy; + best_energy_cpu = max_spare_cap_cpu; + } + } + } +unlock: + rcu_read_unlock(); + + /* + * Pick the best CPU if prev_cpu cannot be used, or if it saves at + * least 6% of the energy used by prev_cpu. + */ + if (prev_energy == ULONG_MAX) + return best_energy_cpu; + + if ((prev_energy - best_energy) > (prev_energy >> 4)) + return best_energy_cpu; + + return prev_cpu; + +fail: + rcu_read_unlock(); + + return -1; +} + +/* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. @@ -6355,8 +6607,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, &p->cpus_allowed); + + if (static_branch_unlikely(&sched_energy_present)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) + return new_cpu; + new_cpu = prev_cpu; + } + + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); @@ -6520,7 +6780,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -6532,7 +6792,7 @@ static void set_last_buddy(struct sched_entity *se) static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -6590,8 +6850,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(curr->policy == SCHED_IDLE) && - likely(p->policy != SCHED_IDLE)) + if (unlikely(task_has_idle_policy(curr)) && + likely(!task_has_idle_policy(p))) goto preempt; /* @@ -7012,7 +7272,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (p->sched_class != &fair_sched_class) return 0; - if (unlikely(p->policy == SCHED_IDLE)) + if (unlikely(task_has_idle_policy(p))) return 0; /* @@ -7896,16 +8156,16 @@ static bool update_nohz_stats(struct rq *rq, bool force) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate pullable load (e.g. >1 runnable task). + * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, - struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs, - bool *overload) + struct sched_group *group, + struct sg_lb_stats *sgs, + int *sg_status) { + int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + int load_idx = get_sd_load_idx(env->sd, env->idle); unsigned long load; int i, nr_running; @@ -7929,7 +8189,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; if (nr_running > 1) - *overload = true; + *sg_status |= SG_OVERLOAD; + + if (cpu_overutilized(i)) + *sg_status |= SG_OVERUTILIZED; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -7945,7 +8208,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *overload = 1; + *sg_status |= SG_OVERLOAD; } } @@ -8090,17 +8353,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx; - bool overload = false; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) env->flags |= LBF_NOHZ_STATS; #endif - load_idx = get_sd_load_idx(env->sd, env->idle); - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -8115,8 +8375,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + update_sg_lb_stats(env, sg, sgs, &sg_status); if (local_group) goto next_group; @@ -8165,9 +8424,15 @@ next_group: env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { + struct root_domain *rd = env->dst_rq->rd; + /* update overload indicator if we are at root domain */ - if (READ_ONCE(env->dst_rq->rd->overload) != overload) - WRITE_ONCE(env->dst_rq->rd->overload, overload); + WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + + /* Update over-utilization (tipping point, U >= 0) indicator */ + WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + } else if (sg_status & SG_OVERUTILIZED) { + WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); } } @@ -8394,6 +8659,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (static_branch_unlikely(&sched_energy_present)) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -8910,13 +9183,22 @@ out_all_pinned: sd->nr_balance_failed = 0; out_one_pinned: + ld_moved = 0; + + /* + * idle_balance() disregards balance intervals, so we could repeatedly + * reach this code, which would lead to balance_interval skyrocketting + * in a short amount of time. Skip the balance_interval increase logic + * to avoid that. + */ + if (env.idle == CPU_NEWLY_IDLE) + goto out; + /* tune up the balancing interval */ - if (((env.flags & LBF_ALL_PINNED) && - sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) + if ((env.flags & LBF_ALL_PINNED && + sd->balance_interval < MAX_PINNED_INTERVAL) || + sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; - - ld_moved = 0; out: return ld_moved; } @@ -9281,7 +9563,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { for_each_cpu(i, sched_domain_span(sd)) { if (i == cpu || @@ -9533,9 +9815,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) return false; } - /* - * barrier, pairs with nohz_balance_enter_idle(), ensures ... - */ + /* could be _relaxed() */ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); if (!(flags & NOHZ_KICK_MASK)) return false; @@ -9785,6 +10065,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); } /* diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index e6802181900f..81faddba9e20 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -8,14 +8,14 @@ */ #include "sched.h" -DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); -EXPORT_SYMBOL_GPL(housekeeping_overriden); +DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); +EXPORT_SYMBOL_GPL(housekeeping_overridden); static cpumask_var_t housekeeping_mask; static unsigned int housekeeping_flags; int housekeeping_any_cpu(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); @@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return housekeeping_mask; return cpu_possible_mask; @@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) set_cpus_allowed_ptr(t, housekeeping_mask); } @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return cpumask_test_cpu(cpu, housekeeping_mask); return true; @@ -53,7 +53,7 @@ void __init housekeeping_init(void) if (!housekeeping_flags) return; - static_branch_enable(&housekeeping_overriden); + static_branch_enable(&housekeeping_overridden); if (housekeeping_flags & HK_FLAG_TICK) sched_tick_offload_init(); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 76e0eaf4654e..3cd8a3a795d2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -210,7 +210,7 @@ static int membarrier_register_global_expedited(void) * future scheduler executions will observe the new * thread flag state for this mm. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -246,7 +246,7 @@ static int membarrier_register_private_expedited(int flags) * Ensure all future scheduler executions will observe the * new thread flag state for this process. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(state, &mm->membarrier_state); @@ -298,7 +298,7 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) - synchronize_sched(); + synchronize_rcu(); return 0; case MEMBARRIER_CMD_GLOBAL_EXPEDITED: return membarrier_global_expedited(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a21ea6021929..e4f398ad9e73 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +static inline void set_next_task(struct rq *rq, struct task_struct *p) +{ + p->se.exec_start = rq_clock_task(rq); + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); +} + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; - struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; do { @@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) rt_rq = group_rt_rq(rt_se); } while (rt_rq); - p = rt_task_of(rt_se); - p->se.exec_start = rq_clock_task(rq); - - return p; + return rt_task_of(rt_se); } static struct task_struct * @@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = _pick_next_task_rt(rq); - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); + set_next_task(rq, p); rt_queue_push_tasks(rq); @@ -1810,10 +1813,8 @@ static int push_rt_task(struct rq *rq) return 0; retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); + if (WARN_ON(next_task == rq->curr)) return 0; - } /* * It's possible that the next_task slipped in of @@ -2355,12 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void set_curr_task_rt(struct rq *rq) { - struct task_struct *p = rq->curr; - - p->se.exec_start = rq_clock_task(rq); - - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); + set_next_task(rq, rq->curr); } static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e524ab589c9..0ba08924e017 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -45,6 +45,7 @@ #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/delayacct.h> +#include <linux/energy_model.h> #include <linux/init_task.h> #include <linux/kprobes.h> #include <linux/kthread.h> @@ -177,6 +178,11 @@ static inline bool valid_policy(int policy) rt_policy(policy) || dl_policy(policy); } +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + static inline int task_has_rt_policy(struct task_struct *p) { return rt_policy(p->policy); @@ -632,7 +638,7 @@ struct dl_rq { /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates - * the decision wether or not a ready but not running task + * the decision whether or not a ready but not running task * should migrate somewhere else. */ struct { @@ -704,6 +710,16 @@ static inline bool sched_asym_prefer(int a, int b) return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); } +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -726,6 +742,9 @@ struct root_domain { */ int overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -756,6 +775,12 @@ struct root_domain { struct cpupri cpupri; unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain *pd; }; extern struct root_domain def_root_domain; @@ -1285,7 +1310,8 @@ DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { @@ -1429,7 +1455,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of + * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); @@ -1794,12 +1820,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; - if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP + if (prev_nr < 2 && rq->nr_running >= 2) { if (!READ_ONCE(rq->rd->overload)) WRITE_ONCE(rq->rd->overload, 1); -#endif } +#endif sched_update_tick_dependency(rq); } @@ -1855,27 +1881,6 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} -#endif -#else -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#endif - -#ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -2207,6 +2212,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type); + +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + + return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2233,6 +2263,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + return cfs; +} #endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ @@ -2262,3 +2297,13 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } #endif + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) +#else +#define perf_domain_span(pd) NULL +#endif + +#ifdef CONFIG_SMP +extern struct static_key_false sched_energy_present; +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8d7f15ba5916..3f35ba1d8fde 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,199 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_MUTEX(sched_energy_mutex); +bool sched_energy_update; + +static void free_pd(struct perf_domain *pd) +{ + struct perf_domain *tmp; + + while (pd) { + tmp = pd->next; + kfree(pd); + pd = tmp; + } +} + +static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) +{ + while (pd) { + if (cpumask_test_cpu(cpu, perf_domain_span(pd))) + return pd; + pd = pd->next; + } + + return NULL; +} + +static struct perf_domain *pd_init(int cpu) +{ + struct em_perf_domain *obj = em_cpu_get(cpu); + struct perf_domain *pd; + + if (!obj) { + if (sched_debug()) + pr_info("%s: no EM found for CPU%d\n", __func__, cpu); + return NULL; + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return NULL; + pd->em_pd = obj; + + return pd; +} + +static void perf_domain_debug(const struct cpumask *cpu_map, + struct perf_domain *pd) +{ + if (!sched_debug() || !pd) + return; + + printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); + + while (pd) { + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + cpumask_first(perf_domain_span(pd)), + cpumask_pr_args(perf_domain_span(pd)), + em_pd_nr_cap_states(pd->em_pd)); + pd = pd->next; + } + + printk(KERN_CONT "\n"); +} + +static void destroy_perf_domain_rcu(struct rcu_head *rp) +{ + struct perf_domain *pd; + + pd = container_of(rp, struct perf_domain, rcu); + free_pd(pd); +} + +static void sched_energy_set(bool has_eas) +{ + if (!has_eas && static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: stopping EAS\n", __func__); + static_branch_disable_cpuslocked(&sched_energy_present); + } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: starting EAS\n", __func__); + static_branch_enable_cpuslocked(&sched_energy_present); + } +} + +/* + * EAS can be used on a root domain if it meets all the following conditions: + * 1. an Energy Model (EM) is available; + * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. + * 3. the EM complexity is low enough to keep scheduling overheads low; + * 4. schedutil is driving the frequency of all CPUs of the rd; + * + * The complexity of the Energy Model is defined as: + * + * C = nr_pd * (nr_cpus + nr_cs) + * + * with parameters defined as: + * - nr_pd: the number of performance domains + * - nr_cpus: the number of CPUs + * - nr_cs: the sum of the number of capacity states of all performance + * domains (for example, on a system with 2 performance domains, + * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * + * It is generally not a good idea to use such a model in the wake-up path on + * very complex platforms because of the associated scheduling overheads. The + * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs + * with per-CPU DVFS and less than 8 capacity states each, for example. + */ +#define EM_MAX_COMPLEXITY 2048 + +extern struct cpufreq_governor schedutil_gov; +static bool build_perf_domains(const struct cpumask *cpu_map) +{ + int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + struct perf_domain *pd = NULL, *tmp; + int cpu = cpumask_first(cpu_map); + struct root_domain *rd = cpu_rq(cpu)->rd; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + if (!per_cpu(sd_asym_cpucapacity, cpu)) { + if (sched_debug()) { + pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_map)); + } + goto free; + } + + for_each_cpu(i, cpu_map) { + /* Skip already covered CPUs. */ + if (find_pd(pd, i)) + continue; + + /* Do not attempt EAS if schedutil is not being used. */ + policy = cpufreq_cpu_get(i); + if (!policy) + goto free; + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (rd->pd) + pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_map)); + goto free; + } + + /* Create the new pd and add it to the local list. */ + tmp = pd_init(i); + if (!tmp) + goto free; + tmp->next = pd; + pd = tmp; + + /* + * Count performance domains and capacity states for the + * complexity check. + */ + nr_pd++; + nr_cs += em_pd_nr_cap_states(pd->em_pd); + } + + /* Bail out if the Energy Model complexity is too high. */ + if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", + cpumask_pr_args(cpu_map)); + goto free; + } + + perf_domain_debug(cpu_map, pd); + + /* Attach the new list of performance domains to the root domain. */ + tmp = rd->pd; + rcu_assign_pointer(rd->pd, pd); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return !!pd; + +free: + free_pd(pd); + tmp = rd->pd; + rcu_assign_pointer(rd->pd, NULL); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return false; +} +#else +static void free_pd(struct perf_domain *pd) { } +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ + static void free_rootdomain(struct rcu_head *rcu) { struct root_domain *rd = container_of(rcu, struct root_domain, rcu); @@ -211,6 +404,7 @@ static void free_rootdomain(struct rcu_head *rcu) free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); + free_pd(rd->pd); kfree(rd); } @@ -397,7 +591,8 @@ DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -423,7 +618,10 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); + + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* @@ -1133,7 +1331,6 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, - .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, .child = child, @@ -1164,7 +1361,6 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; @@ -1934,6 +2130,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { + bool __maybe_unused has_eas = false; int i, j, n; int new_topology; @@ -1961,8 +2158,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) + if (cpumask_equal(doms_cur[i], doms_new[j]) && + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* No match - a current sched domain not in new doms_new[] */ @@ -1982,8 +2179,8 @@ match1: /* Build new domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) + if (cpumask_equal(doms_new[i], doms_cur[j]) && + dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* No match - add a new doms_new */ @@ -1992,6 +2189,24 @@ match2: ; } +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + /* Build perf. domains: */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n && !sched_energy_update; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) && + cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { + has_eas = true; + goto match3; + } + } + /* No match - add perf. domains for a new rd */ + has_eas |= build_perf_domains(doms_new[i]); +match3: + ; + } + sched_energy_set(has_eas); +#endif + /* Remember the new sched domains: */ if (doms_cur != &fallback_doms) free_sched_domains(doms_cur, ndoms_cur); diff --git a/kernel/sys.c b/kernel/sys.c index 123bd73046ec..64b5a230f38d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -121,6 +121,9 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef PAC_RESET_KEYS +# define PAC_RESET_KEYS(a, b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2476,6 +2479,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; + case PR_PAC_RESET_KEYS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PAC_RESET_KEYS(me, arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fa5de5e8de61..2c97e8c2d29f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Alarmtimer interface * @@ -10,10 +11,6 @@ * Copyright (C) 2010 IBM Corperation * * Author: John Stultz <john.stultz@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/time.h> #include <linux/hrtimer.h> diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 8c0e4092f661..5e77662dd2d9 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/clockevents.c - * * This file contains functions which manage clock event devices. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include <linux/clockchips.h> @@ -39,10 +35,8 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, u64 clc = (u64) latch << evt->shift; u64 rnd; - if (unlikely(!evt->mult)) { + if (WARN_ON(!evt->mult)) evt->mult = 1; - WARN_ON(1); - } rnd = (u64) evt->mult - 1; /* @@ -164,10 +158,8 @@ void clockevents_switch_state(struct clock_event_device *dev, * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { - if (unlikely(!dev->mult)) { + if (WARN_ON(!dev->mult)) dev->mult = 1; - WARN_ON(1); - } } } } @@ -315,10 +307,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, int64_t delta; int rc; - if (unlikely(expires < 0)) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(expires < 0)) return -ETIME; - } dev->next_event = expires; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ffe081623aec..3bcc19ceb073 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1,26 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * linux/kernel/time/clocksource.c - * * This file contains the functions which manage clocksource drivers. * * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * TODO WishList: - * o Allow clocksource drivers to be unregistered */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9cdd74bd2d27..f5cfa1b73d6f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,34 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/hrtimer.c - * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. - * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing + * In contrast to the low-resolution timeout API, aka timer wheel, + * hrtimers provide finer resolution and accuracy depending on system + * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: - * based on kernel/timer.c + * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. - * - * For licencing details see kernel-base/COPYING */ #include <linux/cpu.h> diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9a65713c8309..02068b2d5862 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/itimer.c - * * Copyright (C) 1992 Darren Senn */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 497719127bf9..dc1b6f1929f9 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -1,25 +1,9 @@ -/*********************************************************************** -* linux/kernel/time/jiffies.c -* -* This file contains the jiffies based clocksource. -* -* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -* -************************************************************************/ +// SPDX-License-Identifier: GPL-2.0+ +/* + * This file contains the jiffies based clocksource. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + */ #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/module.h> diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c5e0cba3b39c..bc3a3c37ec9c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,7 +17,6 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> -#include <linux/math64.h> #include "ntp_internal.h" #include "timekeeping_internal.h" diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index fe56c4e06c51..425bbfce6819 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * posix-clock.c - support for dynamic clock devices + * Support for dynamic clock devices * * Copyright (C) 2010 OMICRON electronics GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/device.h> #include <linux/export.h> diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 989ccf028bde..a51895486e5e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Dummy stubs used when CONFIG_POSIX_TIMERS=n * * Created by: Nicolas Pitre, July 2016 * Copyright: (C) 2016 Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31f49ae80f43..0e84bb72a3da 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1,34 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * linux/kernel/posix-timers.c - * - * * 2002-10-15 Posix Clocks & timers * by George Anzinger george@mvista.com - * * Copyright (C) 2002 2003 by MontaVista Software. * * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. * Copyright (C) 2004 Boris Hu * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA - */ - -/* These are all the functions necessary to implement - * POSIX clocks & timers + * These are all the functions necessary to implement POSIX clocks & timers */ #include <linux/mm.h> #include <linux/interrupt.h> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cbc72c2c1fca..094b82ca95e5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * sched_clock.c: Generic sched_clock() support, to extend low level - * hardware time counters to full 64-bit ns values. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * Generic sched_clock() support, to extend low level hardware time + * counters to full 64-bit ns values. */ #include <linux/clocksource.h> #include <linux/init.h> diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index b0928ab3270f..77c63005dc4e 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * udelay() test kernel module * @@ -7,15 +8,6 @@ * Specifying usecs of 0 or negative values will run multiples tests. * * Copyright (C) 2014 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/debugfs.h> diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a59641fb88b6..5be6154e2fd2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-broadcast-hrtimer.c - * This file emulates a local clock event device - * via a pseudo clock device. + * Emulate a local clock event device via a pseudo clock device. */ #include <linux/cpu.h> #include <linux/err.h> diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index be0aac2b4300..803fa67aace9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1,15 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-broadcast.c - * * This file contains functions which emulate a local clock-event * device via a broadcast event source. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include <linux/cpu.h> #include <linux/err.h> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 14de3727b18e..529143b4c8d2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -1,15 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-common.c - * * This file contains the base functions to manage periodic tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include <linux/cpu.h> #include <linux/err.h> diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 6fe615d57ebb..f9745d47425a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -1,15 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-oneshot.c - * * This file contains functions which manage high resolution tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include <linux/cpu.h> #include <linux/err.h> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69e673b88474..6fa52cd6df0b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-sched.c - * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner @@ -8,8 +7,6 @@ * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar - * - * Distribute under GPLv2. */ #include <linux/cpu.h> #include <linux/err.h> diff --git a/kernel/time/time.c b/kernel/time/time.c index ad204cf6d001..5aa0a156e331 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -1,14 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time.c - * * Copyright (C) 1991, 1992 Linus Torvalds * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c + * This file contains the interface functions for the various time related + * system calls: time, stime, gettimeofday, settimeofday, adjtime + * + * Modification history: * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index f83bbb81600b..7ed0e0fb5831 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + scale=0 define gcd(a,b) { diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 7142580ad94f..589e0a552129 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 8afd78932bdf..85b98e727306 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * linux/kernel/time/timecounter.c - * - * based on code that migrated away from - * linux/kernel/time/clocksource.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * Based on clocksource code. See commit 74d23cc704d1 */ - #include <linux/export.h> #include <linux/timecounter.h> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2d110c948805..c801e25875a3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/timekeeping.c - * - * Kernel timekeeping code and accessor functions - * - * This code was moved from linux/kernel/timer.c. - * Please see that file for copyright and history logs. - * + * Kernel timekeeping code and accessor functions. Based on code from + * timer.c, moved in commit 8524070b7982. */ - #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> @@ -50,7 +45,9 @@ enum timekeeping_adv_mode { static struct { seqcount_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_ZERO(tk_core.seq), +}; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 238e4be60229..86489950d690 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * debugfs file to track time spent in suspend * * Copyright (c) 2011, Google, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/debugfs.h> @@ -28,7 +19,7 @@ static unsigned int sleep_time_bin[NUM_BINS] = {0}; -static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +static int tk_debug_sleep_time_show(struct seq_file *s, void *data) { unsigned int bin; seq_puts(s, " time (secs) count\n"); @@ -42,18 +33,7 @@ static int tk_debug_show_sleep_time(struct seq_file *s, void *data) } return 0; } - -static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) -{ - return single_open(file, tk_debug_show_sleep_time, NULL); -} - -static const struct file_operations tk_debug_sleep_time_fops = { - .open = tk_debug_sleep_time_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); static int __init tk_debug_sleep_time_init(void) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index fa49cd753dea..444156debfa0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/timer.c - * * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d647dabdac97..98ba50dcb1b2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * kernel/time/timer_list.c - * * List pending timers * * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/proc_fs.h> diff --git a/kernel/torture.c b/kernel/torture.c index 17d91f5fba2a..bbf6d473e50c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,11 +194,23 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); + int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + if (!IS_MODULE(CONFIG_TORTURE_TEST)) + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = cpu_up(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: Initial online %d: errno %d\n", + __func__, torture_type, cpu, ret); + } + } if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); @@ -233,16 +245,15 @@ stop: */ int torture_onoff_init(long ooholdoff, long oointerval) { - int ret = 0; - #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - ret = torture_create_kthread(torture_onoff, NULL, onoff_task); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return ret; + return torture_create_kthread(torture_onoff, NULL, onoff_task); +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -513,15 +524,13 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret = 0; - torture_shutdown_hook = cleanup; if (ssecs > 0) { shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); - ret = torture_create_kthread(torture_shutdown, NULL, + return torture_create_kthread(torture_shutdown, NULL, shutdown_task); } - return ret; + return 0; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -620,13 +629,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(int s) +int torture_stutter_init(const int s) { - int ret; - stutter = s; - ret = torture_create_kthread(torture_stutter, NULL, stutter_task); - return ret; + return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e23eb9fc77aa..f0ff24173a0b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -173,7 +173,7 @@ static void ftrace_sync(struct work_struct *work) { /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. * * Yes, function tracing is rude. @@ -934,7 +934,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 0; /* * unregister_ftrace_profiler calls stop_machine - * so this acts like an synchronize_sched. + * so this acts like an synchronize_rcu. */ unregister_ftrace_profiler(); } @@ -1086,7 +1086,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) /* * Some of the ops may be dynamically allocated, - * they are freed after a synchronize_sched(). + * they are freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -1286,7 +1286,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; - call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); + call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } void ftrace_free_filter(struct ftrace_ops *ops) @@ -1501,7 +1501,7 @@ static bool hash_contains_ip(unsigned long ip, * the ip is not in the ops->notrace_hash. * * This needs to be called with preemption disabled as - * the hashes are freed with call_rcu_sched(). + * the hashes are freed with call_rcu(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) @@ -4496,7 +4496,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, if (ftrace_enabled && !ftrace_hash_empty(hash)) ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); - synchronize_sched(); + synchronize_rcu(); hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); @@ -5314,7 +5314,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) mutex_unlock(&graph_lock); /* Wait till all users are no longer using the old hash */ - synchronize_sched(); + synchronize_rcu(); free_ftrace_hash(old_hash); } @@ -5708,7 +5708,7 @@ void ftrace_release_mod(struct module *mod) list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); - call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + call_rcu(&mod_map->rcu, ftrace_free_mod_map); break; } } @@ -5928,7 +5928,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; - /* mod_map is freed via call_rcu_sched() */ + /* mod_map is freed via call_rcu() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); @@ -6263,7 +6263,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). + * they must be freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -6434,7 +6434,7 @@ static void clear_ftrace_pids(struct trace_array *tr) rcu_assign_pointer(tr->function_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(pid_list); } @@ -6581,7 +6581,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, rcu_assign_pointer(tr->function_pids, pid_list); if (filtered_pids) { - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* Register a probe to set whether to ignore the tracing of a task */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220d..4f3247a53259 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1834,7 +1834,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, * There could have been a race between checking * record_disable and incrementing it. */ - synchronize_sched(); + synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); @@ -3151,7 +3151,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct ring_buffer *buffer) { @@ -3253,7 +3253,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { @@ -4191,7 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); void ring_buffer_read_prepare_sync(void) { - synchronize_sched(); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); @@ -4363,7 +4363,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -4496,7 +4496,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, goto out; /* - * We can't do a synchronize_sched here because this + * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a..51612b4a603f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); @@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); @@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void) preempt_enable(); /* Wait for all current users to finish */ - synchronize_sched(); + synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); @@ -5398,7 +5398,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); - /* Current trace needs to be nop_trace before synchronize_sched */ + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; #ifdef CONFIG_TRACER_MAX_TRACE @@ -5412,7 +5412,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ - synchronize_sched(); + synchronize_rcu(); free_snapshot(tr); } #endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5574e862de8d..27821480105e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1616,7 +1616,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() and to ensure all calls are + * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ tracepoint_synchronize_unregister(); @@ -1848,7 +1848,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, if (filter) { /* * No event actually uses the system filter - * we can free it without synchronize_sched(). + * we can free it without synchronize_rcu(). */ __free_filter(system->filter); system->filter = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d2..adc153ab51c0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -333,7 +333,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * event_call related objects, which will be accessed in * the kprobe_trace_func/kretprobe_trace_func. */ - synchronize_sched(); + synchronize_rcu(); kfree(link); /* Ignored if link == NULL */ } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index a3be42304485..46f2ab1e08a9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -92,7 +92,7 @@ static __init int release_early_probes(void) while (early_probes) { tmp = early_probes; early_probes = tmp->next; - call_rcu_sched(tmp, rcu_free_old_probes); + call_rcu(tmp, rcu_free_old_probes); } return 0; @@ -123,7 +123,7 @@ static inline void release_probes(struct tracepoint_func *old) * cover both cases. So let us chain the SRCU and sched RCU * callbacks to wait for both grace periods. */ - call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0280deac392e..392be4b252f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3396,7 +3396,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->mayday_timer); /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); + call_rcu(&pool->rcu, rcu_free_pool); } /** @@ -3503,14 +3503,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work) put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); - call_rcu_sched(&pwq->rcu, rcu_free_pwq); + call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } /** @@ -4195,7 +4195,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. */ - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly |