diff options
Diffstat (limited to 'kernel')
90 files changed, 2735 insertions, 1504 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 62d686d96581..9eb8b3511636 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. 
+ */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) diff 
--git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..2f4039bafebb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * @@ -1892,6 +1899,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { @@ -2343,13 +2351,7 @@ void cpuset_update_active_cpus(void) * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks to the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. 
*/ - partition_sched_domains(1, NULL, NULL); schedule_work(&cpuset_hotplug_work); } diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index d70829033bb7..d3fd428f4b92 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -10,6 +10,7 @@ # CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..acf5308fad51 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } @@ -1252,7 +1253,17 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, struct cpuhp_step *sp; int ret = 0; - if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { + /* + * If name is NULL, then the state gets removed. + * + * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on + * the first allocation from these dynamic ranges, so the removal + * would trigger a new allocation and clear the wrong (already + * empty) state, leaving the callbacks of the to be cleared state + * dangling, which causes wreckage on the next hotplug operation. 
+ */ + if (name && (state == CPUHP_AP_ONLINE_DYN || + state == CPUHP_BP_PREPARE_DYN)) { ret = cpuhp_reserve_state(state); if (ret < 0) return ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..294f1927f944 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx) return parent_ctx; } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, + enum pid_type type) { + u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; - return task_tgid_nr_ns(p, event->ns); + nr = __task_pid_nr_ns(p, type, event->ns); + /* avoid -1 if it is idle thread or runs in another ns */ + if (!nr && !pid_alive(p)) + nr = -1; + return nr; } -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; + return perf_event_pid_type(event, p, __PIDTYPE_TGID); +} - return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + return perf_event_pid_type(event, p, PIDTYPE_PID); } /* @@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). 
This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. 
+ */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } @@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* + * We must check ctx->nr_events while holding ctx->lock, such + * that we serialize against perf_install_in_context(). + */ + if (!ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } @@ -3625,10 +3673,7 @@ unlock: static inline u64 perf_event_count(struct perf_event *event) { - if (event->pmu->count) - return event->pmu->count(event); - - return __perf_event_count(event); + return local64_read(&event->count) + atomic64_read(&event->child_count); } /* @@ -3659,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } - /* - * It must not have a pmu::count method, those are not - * NMI safe. 
- */ - if (event->pmu->count) { - ret = -EOPNOTSUPP; - goto out; - } - /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { @@ -5090,7 +5126,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5149,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5447,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } @@ -5972,6 +6008,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5987,6 +6026,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. 
+ */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6105,6 +6176,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -7256,6 +7330,11 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +void perf_event_itrace_started(struct perf_event *event) +{ + event->attach_state |= PERF_ATTACH_ITRACE; +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7271,7 +7350,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; @@ -7875,16 +7954,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task); + rctx, task, NULL); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { struct perf_sample_data data; - struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7898,9 +7976,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - hlist_for_each_entry_rcu(event, head, hlist_entry) { + /* Use the given event instead of the hlist */ + if (event) { if 
(perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); + } else { + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } } /* @@ -9580,6 +9664,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + attr->size = size; + if (attr->__reserved_1) return -EINVAL; @@ -9852,6 +9938,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; @@ -10001,28 +10092,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. 
+ */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 486fd78eb8d5..843e97047335 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -38,9 +38,9 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ - local_t aux_head; + long aux_head; local_t aux_nest; - local_t aux_wakeup; + long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; @@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion) { int rctx; - if (in_nmi()) + if (unlikely(in_nmi())) rctx = 3; else if (in_irq()) rctx = 2; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ee97196bb151..af71a84e12ee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) goto err_put; - aux_head = local_read(&rb->aux_head); + aux_head = rb->aux_head; handle->rb = rb; handle->event = event; @@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); - handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; - local_set(&rb->aux_head, aux_head); + rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; - aux_head = local_read(&rb->aux_head); - local_add(size, &rb->aux_head); + aux_head = rb->aux_head; + rb->aux_head += 
size; } if (size || handle->aux_flags) { @@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags); } - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { wakeup = true; - local_add(rb->aux_watermark, &rb->aux_wakeup); + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); } if (wakeup) { @@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head; if (size > handle->size) return -ENOSPC; - local_add(size, &rb->aux_head); + rb->aux_head += size; - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - local_add(rb->aux_watermark, &rb->aux_wakeup); - handle->wakeup = local_read(&rb->aux_wakeup) + - rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } - handle->head = aux_head; + handle->head = rb->aux_head; handle->size -= size; return 0; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/exit.c 
b/kernel/exit.c index c5548faa9f37..a35d8a17e01f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -819,7 +818,8 @@ void __noreturn do_exit(long code) * Ensure that we must observe the pi_state in exit_mm() -> * mm_release() -> exit_pi_state_list(). */ - raw_spin_unlock_wait(&tsk->pi_lock); + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -881,9 +881,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,8 +916,9 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); + lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..4e5345c07344 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ #include <linux/sysctl.h> #include <linux/kcov.h> #include <linux/livepatch.h> +#include <linux/thread_info.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -217,7 +218,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, @@ -484,6 +485,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, 
free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -785,6 +788,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES + mm->uprobes_state.xol_area = NULL; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -806,11 +816,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + mm_init_uprobes_state(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -1691,6 +1703,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1962,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/futex.c b/kernel/futex.c index f50b434756c1..3d38eaf05492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid) return p; } +#ifdef CONFIG_FUTEX_PI + /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. 
@@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +#endif + /* * We need to check the following states: * @@ -1547,6 +1551,45 @@ out: return ret; } +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); /* 12-bit field: sign bit is bit 11 */ + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) + return -EINVAL; + oparg = 1 << oparg; + } + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -1800,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. 
This @@ -2595,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (refill_pi_state_cache()) return -ENOMEM; @@ -2774,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) struct futex_q *top_waiter; int ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -2984,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (uaddr == uaddr2) return -EINVAL; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 23958980189d..f51b7b6d2451 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4d384edc0c64..c3fdb36dec30 100644 --- 
a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -5,6 +5,7 @@ */ #include <linux/irqdomain.h> #include <linux/irq.h> +#include <linux/uaccess.h> #include "internals.h" @@ -171,8 +172,55 @@ static int irq_debug_open(struct inode *inode, struct file *file) return single_open(file, irq_debug_show, inode->i_private); } +static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct irq_desc *desc = file_inode(file)->i_private; + char buf[8] = { 0, }; + size_t size; + + size = min(sizeof(buf) - 1, count); + if (copy_from_user(buf, user_buf, size)) + return -EFAULT; + + if (!strncmp(buf, "trigger", size)) { + unsigned long flags; + int err; + + /* Try the HW interface first */ + err = irq_set_irqchip_state(irq_desc_get_irq(desc), + IRQCHIP_STATE_PENDING, true); + if (!err) + return count; + + /* + * Otherwise, try to inject via the resend interface, + * which may or may not succeed. + */ + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, flags); + + if (irq_settings_is_level(desc)) { + /* Can't do level, sorry */ + err = -EINVAL; + } else { + desc->istate |= IRQS_PENDING; + check_irq_resend(desc); + err = 0; + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + + return err ? 
err : count; + } + + return count; +} + static const struct file_operations dfs_irq_ops = { .open = irq_debug_open, + .write = irq_debug_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, @@ -186,7 +234,7 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) return; sprintf(name, "%d", irq); - desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc, + desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc, &dfs_irq_ops); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a2c48058354c..a4aa39009f0d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -151,7 +151,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) #define for_each_action_of_desc(desc, act) \ - for (act = desc->act; act; act = act->next) + for (act = desc->action; act; act = act->next) struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? 
irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ff9912211e9..d62351714f3e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1504,10 +1504,10 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; - if (domain->parent != root_irq_data->domain) + if (!root_irq_data) return -EINVAL; - if (!root_irq_data) + if (domain->parent != root_irq_data->domain) return -EINVAL; child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1d1a5b945ab4..573dc52b0806 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -400,8 +400,18 @@ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) return -EINVAL; data = irq_desc_get_irq_data(desc); - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) + do { + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) ret = chip->irq_set_vcpu_affinity(data, vcpu_info); irq_put_desc_unlock(desc, flags); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7f9642a1e267..6376b4a598d3 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -61,12 +61,12 @@ static int show_irq_affinity(int type, struct seq_file *m) case EFFECTIVE: case EFFECTIVE_LIST: #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - mask = 
desc->irq_common_data.effective_affinity; + mask = irq_data_get_effective_affinity_mask(&desc->irq_data); break; -#else - return -EINVAL; #endif - }; + default: + return -EINVAL; + } switch (type) { case AFFINITY_LIST: diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506a6ac3..0bf2e8f5244a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,29 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - -void static_key_slow_inc(struct static_key *key) +static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key) return; } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); - atomic_set(&key->enabled, 1); + /* + * Ensure that if the above cmpxchg loop observes our positive + * value, it must also observe all the text changes. 
+ */ + atomic_set_release(&key->enabled, 1); } else { atomic_inc(&key->enabled); } jump_label_unlock(); +} + +void static_key_slow_inc(struct static_key *key) +{ + cpus_read_lock(); + static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, struct delayed_work *work) +void static_key_enable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) > 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); + jump_label_update(key); + /* + * See static_key_slow_inc(). + */ + atomic_set_release(&key->enabled, 1); + } + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ + cpus_read_lock(); + static_key_enable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + + jump_label_lock(); + if (atomic_cmpxchg(&key->enabled, 1, 0)) + jump_label_update(key); + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key) { cpus_read_lock(); + static_key_disable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + +static void static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 
WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); - cpus_read_unlock(); return; } @@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); +} + +static void __static_key_slow_dec(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + cpus_read_lock(); + static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1ae7c41c33c1..20fef1a38602 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; - pages = alloc_pages(gfp_mask, order); + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; @@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); + + arch_kexec_post_alloc_pages(page_address(pages), count, + gfp_mask); + + if (gfp_mask & __GFP_ZERO) + for (i = 0; i < count; i++) + clear_highpage(pages + i); } return pages; @@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page) order = page_private(page); count = 1 << order; + + arch_kexec_pre_free_pages(page_address(page), count); + for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. 
+ * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + +/* modprobe_path is set via /proc/sys. */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); diff --git a/kernel/kthread.c b/kernel/kthread.c index 26db528c1d88..1c19edf82427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -637,6 +637,7 @@ repeat: schedule(); try_to_freeze(); + cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..44c8d0d17170 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include <linux/slab.h> +#endif + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define 
SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. 
We cache the result in the lock object @@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); + if (cross_lock(tgt->instance)) { + printk(" Possible unsafe locking scenario by crosslock:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk(" unlock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } else { + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } } /* @@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - pr_warn("\nbut task is already holding lock:\n"); + + if (cross_lock(check_tgt->instance)) + pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); + else + pr_warn("\nbut task is already holding lock:\n"); + 
print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data) static noinline int print_circular_bug(struct lock_list *this, struct lock_list *target, struct held_lock *check_src, - struct held_lock *check_tgt) + struct held_lock *check_tgt, + struct stack_trace *trace) { struct task_struct *curr = current; struct lock_list *parent; @@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (!save_trace(&this->trace)) + if (cross_lock(check_tgt->instance)) + this->trace = *trace; + else if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target, return result; } +static noinline int +check_redundant(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_redundant_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; + if (cross_lock(prev->instance)) + continue; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int *stack_saved) + struct held_lock *next, int distance, struct stack_trace *trace, + int (*save)(struct stack_trace *trace)) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list 
*uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); + return print_circular_bug(&this, target_entry, next, prev, trace); else if (unlikely(ret < 0)) return print_bfs_bug(ret); @@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 2; + return 1; } } - if (!*stack_saved) { - if (!save_trace(&trace)) - return 0; - *stack_saved = 1; + /* + * Is the <prev> -> <next> link redundant? 
+ */ + this.class = hlock_class(prev); + this.parent = NULL; + ret = check_redundant(&this, hlock_class(next), &target_entry); + if (!ret) { + debug_atomic_inc(nr_redundant); + return 2; } + if (ret < 0) + return print_bfs_bug(ret); + + + if (save && !save(trace)) + return 0; /* * Ok, all validations passed, add the new lock @@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; @@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - /* We drop graph lock, so another thread can overwrite trace. */ - *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, print_lock_name(hlock_class(next)); printk(KERN_CONT "\n"); dump_stack(); - return graph_lock(); + if (!graph_lock()) + return 0; } - return 1; + return 2; } /* @@ -1925,8 +1985,9 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int stack_saved = 0; struct held_lock *hlock; + struct stack_trace trace; + int (*save)(struct stack_trace *trace) = save_trace; /* * Debugging checks. @@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* - * Only non-recursive-read entries get new dependencies - * added: + * Only non-crosslock entries get new dependencies added. 
+ * Crosslock entries will be added by commit later: */ - if (hlock->read != 2 && hlock->check) { - if (!check_prev_add(curr, hlock, next, - distance, &stack_saved)) - return 0; + if (!cross_lock(hlock->instance)) { /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!hlock->trylock) - break; + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) + return 0; + + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } } depth--; /* @@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr, } /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * This is for building a chain between just two different classes, + * instead of adding a new hlock upon current, which is done by + * add_chain_cache(). + * + * This can be called in any context with two classes, while + * add_chain_cache() must be done within the lock owner's context + * since it uses hlock which might be racy in another context. 
*/ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) +static inline int add_chain_cache_classes(unsigned int prev, + unsigned int next, + unsigned int irq_context, + u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - int i, j; + + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ /* * We might need to take the graph lock, ensure we've got IRQs @@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr, */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; + + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); + dump_stack(); + return 0; + } + + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + chain->irq_context = irq_context; + chain->depth = 2; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; + chain_hlocks[chain->base] = prev - 1; + chain_hlocks[chain->base + 1] = next -1; + } +#ifdef CONFIG_DEBUG_LOCKDEP /* - * We can walk it lock-free, because entries only get added - * to the hash: + * Important for check_no_collision(). 
*/ - hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (!check_no_collision(curr, hlock, chain)) - return 0; - - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); + else { + if (!debug_locks_off_graph_unlock()) return 0; - } + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); +#endif + + hlist_add_head_rcu(&chain->entry, hash_head); + debug_atomic_inc(chain_lookup_misses); + inc_chains(); + + return 1; +} + +/* + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. + */ +static inline int add_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + int i, j; + /* * Allocate a new chain entry from the static array, and add * it to the hash: */ - if (!graph_lock()) - return 0; + /* - * We have to walk the chain again locked - to avoid duplicates: + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. 
*/ - hlist_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2235,6 +2351,78 @@ cache_hit: return 1; } +/* + * Look up a dependency chain. + */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + hlist_for_each_entry_rcu(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + debug_atomic_inc(chain_lookup_hits); + return chain; + } + } + return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) 
+ */ +static inline int lookup_chain_cache_add(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct lock_chain *chain = lookup_chain_cache(chain_key); + + if (chain) { +cache_hit: + if (!check_no_collision(curr, hlock, chain)) + return 0; + + if (very_verbose(class)) { + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + } + + return 0; + } + + if (very_verbose(class)) { + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); + } + + if (!graph_lock()) + return 0; + + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + chain = lookup_chain_cache(chain_key); + if (chain) { + graph_unlock(); + goto cache_hit; + } + + if (!add_chain_cache(curr, hlock, chain_key)) + return 0; + + return 1; +} + static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { @@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. 
- * (If lookup_chain_cache() returns with 1 it acquires + * (If lookup_chain_cache_add() returns with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && hlock->check && - lookup_chain_cache(curr, hlock, chain_key)) { + lookup_chain_cache_add(curr, hlock, chain_key)) { /* * Check whether last held lock: * @@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * Add dependency only if this lock is not the head * of the chain, and if it's not a secondary read-lock: */ - if (!chain_head && ret != 2) + if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) return 0; + } + graph_unlock(); - } else - /* after lookup_chain_cache(): */ + } else { + /* after lookup_chain_cache_add(): */ if (unlikely(!debug_locks)) return 0; + } return 1; } @@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. 
- */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - return 1; } @@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* @@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } + +void lockdep_init_map(struct 
lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 0); + __lockdep_init_map(lock, name, key, subclass); +} EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 1); + __lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif + struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; + int ret; if (unlikely(!debug_locks)) return 0; @@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - if (depth) { + /* TODO: nest_lock is not implemented for crosslock yet. */ + if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; + ret = lock_acquire_crosslock(hlock); + /* + * 2 means normal acquire operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int i; + int ret, i; if (unlikely(!debug_locks)) return 0; + ret = lock_release_crosslock(lock); + /* + * 2 means normal release operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. 
+ */ + if (ret != 2) + return ret; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, @@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void) curr->comm, curr->pid); lockdep_print_held_locks(curr); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + lockdep_invariant_state(false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A Task-B + * + * mutex_lock(&A); + * mutex_unlock(&A); + * + * wait_for_completion(&C); + * lock_acquire_crosslock(); + * atomic_inc_return(&cross_gen_id); + * | + * | mutex_lock(&B); + * | mutex_unlock(&B); + * | + * | complete(&C); + * `-- lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. 
+ */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ + /* + * Normally, xhlock->hlock.instance must be !NULL. + */ + xhlock->hlock.instance = NULL; +} + +/* + * Lock history stacks; we have 2 nested lock history stacks: + * + * HARD(IRQ) + * SOFT(IRQ) + * + * The thing is that once we complete a HARD/SOFT IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. So + * what we do is rewind the history buffer and erase all our knowledge of that + * temporal event. + */ + +void crossrelease_hist_start(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (!cur->xhlocks) + return; + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } +} + +/* + * lockdep_invariant_state() is used to annotate independence inside a task, to + * make one task look like multiple independent 'tasks'. + * + * Take for instance workqueues; each work is independent of the last. The + * completion of a future work does not depend on the completion of a past work + * (in general). Therefore we must not carry that (lock) dependency across + * works. + * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an invariant state and future completions do not + * depend on past completions. It's just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. 
+ * + * The same is true for system-calls, once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + * + * They key property for independence, this invariant state, is that it must be + * a point where we hold no locks and have no history. Because if we were to + * hold locks, the restore at _end() would not necessarily recover it's history + * entry. Similarly, independence per-definition means it does not depend on + * prior state. + */ +void lockdep_invariant_state(bool force) +{ + /* + * We call this at an invariant point, no current state, no history. + * Verify the former, enforce the latter. + */ + WARN_ON_ONCE(!force && current->lockdep_depth); + invalidate_xhlock(&xhlock(current->xhlock_idx)); +} + +static int cross_lock(struct lockdep_map *lock) +{ + return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrapable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ + return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ + return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ + return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check; +} + +/* + * Check if the xhlock is valid, which would be false if, + * + * 1. Has not used after initializaion yet. + * 2. Got invalidated. + * + * Remind hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ + /* + * xhlock->hlock.instance must be !NULL. 
+ */ + return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * Irq disable is only required. + */ +static void add_xhlock(struct held_lock *hlock) +{ + unsigned int idx = ++current->xhlock_idx; + struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * This can be done locklessly because they are all task-local + * state, we must however ensure IRQs are disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); +#endif + + /* Initialize hist_lock's members */ + xhlock->hlock = *hlock; + xhlock->hist_id = ++current->hist_id; + + xhlock->trace.nr_entries = 0; + xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; + xhlock->trace.entries = xhlock->trace_entries; + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ + return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be lockless as far as possible because this would be + * called very frequently. + */ +static void check_add_xhlock(struct held_lock *hlock) +{ + /* + * Record a hist_lock, only in case that acquisitions ahead + * could depend on the held_lock. For example, if the held_lock + * is trylock then acquisitions ahead never depends on that. + * In that case, we don't need to record it. Just return. + */ + if (!current->xhlocks || !depend_before(hlock)) + return; + + add_xhlock(hlock); +} + +/* + * For crosslock. + */ +static int add_xlock(struct held_lock *hlock) +{ + struct cross_lock *xlock; + unsigned int gen_id; + + if (!graph_lock()) + return 0; + + xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + + /* + * When acquisitions for a crosslock are overlapped, we use + * nr_acquire to perform commit for them, based on cross_gen_id + * of the first acquisition, which allows to add additional + * dependencies. 
+ * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + * + * depend_after() is necessary to initialize only the first + * valid xlock so that the xlock can be used on its commit. + */ + if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) + goto unlock; + + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); + xlock->hlock = *hlock; + xlock->hlock.gen_id = gen_id; +unlock: + graph_unlock(); + return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ + /* + * CONTEXT 1 CONTEXT 2 + * --------- --------- + * lock A (cross) + * X = atomic_inc_return(&cross_gen_id) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Y = atomic_read_acquire(&cross_gen_id) + * lock B + * + * atomic_read_acquire() is for ordering between A and B, + * IOW, A happens before B, when CONTEXT 2 see Y >= X. + * + * Pairs with atomic_inc_return() in add_xlock(). 
+ */ + hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + + if (cross_lock(hlock->instance)) + return add_xlock(hlock); + + check_add_xhlock(hlock); + return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ + unsigned long *buf = stack_trace + nr_stack_trace_entries; + unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + unsigned int nr = min(max_nr, trace->nr_entries); + + trace->nr_entries = nr; + memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); + trace->entries = buf; + nr_stack_trace_entries += nr; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ + unsigned int xid, pid; + u64 chain_key; + + xid = xlock_class(xlock) - lock_classes; + chain_key = iterate_chain_key((u64)0, xid); + pid = xhlock_class(xhlock) - lock_classes; + chain_key = iterate_chain_key(chain_key, pid); + + if (lookup_chain_cache(chain_key)) + return 1; + + if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, + chain_key)) + return 0; + + if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, + &xhlock->trace, copy_trace)) + return 0; + + return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ + unsigned int cur = current->xhlock_idx; + unsigned int prev_hist_id = xhlock(cur).hist_id; + unsigned int i; + + if (!graph_lock()) + return; + + if (xlock->nr_acquire) { + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); + + if (!xhlock_valid(xhlock)) + break; + + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; + + if (!same_context_xhlock(xhlock)) + break; + + /* + * Filter out the cases where the ring buffer was + * overwritten and the current entry has a bigger + * hist_id than the previous 
one, which is impossible + * otherwise: + */ + if (unlikely(before(prev_hist_id, xhlock->hist_id))) + break; + + prev_hist_id = xhlock->hist_id; + + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } + } + + graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ + struct cross_lock *xlock; + unsigned long flags; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (!current->xhlocks) + return; + + /* + * Do commit hist_locks with the cross_lock, only in case that + * the cross_lock could depend on acquisitions after that. + * + * For example, if the cross_lock does not have the 'check' flag + * then we don't need to check dependencies and commit for that. + * Just skip it. In that case, of course, the cross_lock does + * not depend on acquisitions ahead, either. + * + * WARNING: Don't do that in add_xlock() in advance. When an + * acquisition context is different from the commit context, + * invalid(skipped) cross_lock might be accessed. + */ + if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + xlock = &((struct lockdep_map_cross *)lock)->xlock; + commit_xhlocks(xlock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. 
+ */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ + if (cross_lock(lock)) { + if (!graph_lock()) + return 0; + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; + graph_unlock(); + return 1; + } + return 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ + if (cross) + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + + lock->cross = cross; + + /* + * Crossrelease assumes that the ring buffer size of xhlocks + * is aligned with power of 2. So force it on build. + */ + BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ + int i; + + task->xhlock_idx = UINT_MAX; + task->hist_id = 0; + + for (i = 0; i < XHLOCK_CTX_NR; i++) { + task->xhlock_idx_hist[i] = UINT_MAX; + task->hist_id_save[i] = 0; + } + + task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, + GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ + if (task->xhlocks) { + void *tmp = task->xhlocks; + /* Diable crossrelease for current */ + task->xhlocks = NULL; + kfree(tmp); + } +} +#endif diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2f5ba9..1da4669d57a7 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -143,6 +143,8 @@ struct lockdep_stats { int redundant_softirqs_on; int redundant_softirqs_off; int nr_unused_locks; + int nr_redundant_checks; + int nr_redundant; int nr_cyclic_checks; int nr_cyclic_check_recursions; int nr_find_usage_forwards_checks; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc786081..68d9e267ccd4 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(chain_lookup_hits)); seq_printf(m, " cyclic checks: %11llu\n", debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " redundant checks: %11llu\n", 
+ debug_atomic_read(nr_redundant_checks)); + seq_printf(m, " redundant links: %11llu\n", + debug_atomic_read(nr_redundant)); seq_printf(m, " find-mask forwards checks: %11llu\n", debug_atomic_read(nr_find_usage_forwards_checks)); seq_printf(m, " find-mask backwards checks: %11llu\n", diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a3167941093b..a74ee6abd039 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. 
- * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. 
- * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. 
- */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4ccfcaae5b89..43555681c40b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) struct __qspinlock *l = (void *)lock; if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock) /* * The pending bit check in pv_queued_spin_steal_lock() isn't a memory - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock - * just to be sure that it will get it. + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the + * lock just to be sure that it will get it. 
*/ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; return !READ_ONCE(l->locked) && - (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) - == _Q_PENDING_VAL); + (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) */ old = val; new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg(&lock->val, old, new); + val = atomic_cmpxchg_acquire(&lock->val, old, new); if (val == old) return 1; @@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * observe its next->locked value and advance itself. * * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + * + * The write to next->locked in arch_mcs_spin_unlock_contended() + * must be ordered before the read of pn->state in the cmpxchg() + * below for the code to work correctly. To guarantee full ordering + * irrespective of the success or failure of the cmpxchg(), + * a relaxed version with explicit barrier is used. The control + * dependency will order the reading of pn->state before any + * subsequent writes. 
*/ - if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) + != vcpu_halted) return; /* diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..8d039b928d61 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter { /* * Various helpers to access the waiters-tree: */ + +#ifdef CONFIG_RT_MUTEXES + static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p) pi_tree_entry); } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return NULL; +} + +#endif + /* * lock->owner state tracking: */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df98125..0848634c5512 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore 
struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - /* wait to be given the lock */ for (;;) { if (!waiter.task) break; + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } - __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); out: - ; + return 0; + +out_nolock: + /* + * We didn't take the lock, so that there is a writer, which + * is owner or the first waiter of the sem. If it's a waiter, + * it will be woken by current owner. Not need to wake anybody. + */ + list_del(&waiter.list); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f18e49..02f660666ab8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Wait for the read lock to be granted */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); if (!waiter.task) break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + 
raw_spin_unlock_irq(&sem->wait_lock); + break; + } schedule(); } __set_current_state(TASK_RUNNING); return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/syscalls.h> -#include <linux/membarrier.h> -#include <linux/tick.h> - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. 
- */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. 
*/ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed776532..9afdc434fb49 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size) } #endif -static void *try_ram_remap(resource_size_t offset, size_t size) +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) { unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) return __va(offset); + return NULL; /* fallback to arch_memremap_wb */ } @@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem @@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * the requested range is potentially in System RAM. 
*/ if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size); + addr = try_ram_remap(offset, size, flags); if (!addr) addr = arch_memremap_wb(offset, size); } diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..bdd18afa19a4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@ #include <linux/nmi.h> #include <linux/console.h> #include <linux/bug.h> +#include <linux/ratelimit.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err) +{ + WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", + err, (void *)instruction_pointer(regs), + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid()), + from_kuid_munged(&init_user_ns, current_euid())); +} +#endif + core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig 
index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do { \ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = 0; *completed = 0; } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. 
- */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. 
- */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2b62a38b080f..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp) } /* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. 
- */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but * the caller can later invoke rcu_cblist_dequeued_lazy() if it @@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? */ @@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) } /* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. 
- */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* * Return a pointer to the first callback in the specified rcu_segcblist * structure. This is useful for diagnostics. */ @@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) } /* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, return true; return false; } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source. Any pending + * callbacks from the source get to start over. It is best to + * advance and accelerate both the destination and the source + * before merging. 
+ */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp) +{ + struct rcu_cblist donecbs; + struct rcu_cblist pendcbs; + + rcu_cblist_init(&donecbs); + rcu_cblist_init(&pendcbs); + rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); + rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); + rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) rclp->len_lazy--; } -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(!rclp->head); - return rclp->tail; -} - void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, @@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks perf testing. 
*/ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -#define RCUPERF_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * If performance tests complete, wait for shutdown to commence. */ @@ -658,7 +643,7 @@ rcu_perf_init(void) int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - RCUPERF_TASKS_OPS + &tasks_ops, }; if (!torture_init_begin(perf_type, verbose, &perf_runnable)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..45f2ffbc1e78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); static u64 notrace rcu_trace_clock_local(void) { u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + (void)do_div(ts, NSEC_PER_USEC); return ts; } #else /* #ifdef CONFIG_RCU_TRACE */ @@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* @@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) + if (!delay && in_task()) schedule_timeout_interruptible(longdelay); else rcu_read_delay(rrsp); @@ -561,44 +562,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - 
long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) @@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcu" }; @@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcud" }; @@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. */ @@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. 
Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg) return 0; } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ + kfree(rhp); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); + + /* Test call_rcu() invocation from interrupt handler. */ + if (cur_ops->call) { + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + + if (rhp) + cur_ops->call(rhp, rcu_torture_timer_cb); + } } /* @@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags, - wtp == NULL ? ~0UL : wtp->state); + wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? 
-1 : (int)task_cpu(wtp)); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1749,7 +1714,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..729a8706751d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); + + /* + * Make sure that later code is ordered after the SRCU grace + * period. This pairs with the raw_spin_lock_irq_rcu_node() + * in srcu_invoke_callbacks(). 
Unlike Tree RCU, this is needed + * because the current CPU might have been totally uninvolved with + * (and thus unordered against) that grace period. + */ + smp_mb(); } /** @@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, @@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + unsigned long s0 = 0, s1 = 0; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. 
+ */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include <linux/kernel_stat.h> - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..84fe96641b2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ - .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - unsigned long flags; - - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); - local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -862,7 +855,8 @@ 
EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - rcu_eqs_enter(1); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user) if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(oldval, user); + __this_cpu_dec(disable_rcu_irq_enter); } } @@ -979,7 +975,6 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** @@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0, + rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); @@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time. 
*/ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) { @@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_interruptible(rsp->gp_wq, - READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); + swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) @@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. 
- */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. 
*/ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - -/* * Trace the fact that this CPU is going offline. */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) @@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. 
*/ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("LastCB"), -1, + rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); } } @@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rsp->barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, + rsp->barrier_sequence); } } @@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp) struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, s); + _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); /* Did someone else do our work for us? 
*/ if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, + rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Mark the start of the barrier operation. */ rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp) rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, rsp->barrier_sequence); } else { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); @@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_barrier_callback, rsp, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, rsp->barrier_sequence); } } @@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); rcu_seq_end(&rsp->barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. 
*/ @@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (!rdp->beenonline) - WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; @@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; + int nbits; + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp; @@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU @@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* Migrate the dead CPU's callbacks to the current CPU. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *my_rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + return; /* No callbacks to migrate. */ + + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. 
*/ + rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); + raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..8e1f285f0a70 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data { /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -268,7 +266,9 @@ struct rcu_data { struct rcu_head **nocb_follower_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by the leader, hence own cacheline. 
*/ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -350,15 +350,6 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ - struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_cblist orphan_done; /* Orphaned callbacks that */ - /* are ready to invoke. */ - /* (Contains counts.) */ - /* End of fields guarded by orphan_lock. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ @@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); + int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. 
*/ struct rcu_node *rnp; struct rcu_node *rnp_up; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..55bde94b9572 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* @@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } @@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } /* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. 
+ */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + +/* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? */ @@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. 
*/ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. 
*/ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2039,13 +2076,17 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2054,7 +2095,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2107,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. 
*/ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + TPS("WokeEmpty")); + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. - */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. 
*/ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2157,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. 
*/ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. 
+ */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* @@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). 
*/ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index da39489d2d80..de6d7f4dfcb5 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void) goto out_fail; tg = sched_create_group(&root_task_group); - if (IS_ERR(tg)) goto out_free; @@ -101,7 +100,7 @@ out_free: out_fail: if (printk_ratelimit()) { printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? "sched_create_group()" : "kmalloc()"); + ag ? "sched_create_group()" : "kzalloc()"); } return autogroup_kref_get(&autogroup_default); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..cc873075c3bd 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. + */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -47,6 +53,13 @@ EXPORT_SYMBOL(complete); * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. 
+ * + * Since complete_all() sets the completion of @x permanently to done + * to allow multiple waiters to finish, a call to reinit_completion() + * must be used on @x if @x is to be used again. The code must make + * sure that all waiters have woken and finished before reinitializing + * @x. Also note that the function completion_done() can not be used + * to know if there are still waiters after complete_all() has been called. */ void complete_all(struct completion *x) { @@ -92,9 +105,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } @@ -297,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion); * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * + * Note, this will always return true if complete_all() was called on @X. */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +328,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. 
*/ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..6d2c7ff9ba98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) @@ -1967,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3281,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). 
*/ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; @@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); @@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(&current->pi_lock); + raw_spin_lock_irq(&current->pi_lock); + raw_spin_unlock_irq(&current->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); @@ -5103,24 +5133,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); @@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) */ next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); - next->sched_class->put_prev_task(rq, next); + put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d026..8d9562d890d3 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp) * @p: the task * @later_mask: a mask to fill in with the selected CPUs (or NULL) * - * Returns: int - best CPU (heap maximum if suitable) + * Returns: int - CPUs were found */ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask) { - int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { - best_cpu = cpumask_any(later_mask); - goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && - dl_time_before(dl_se->deadline, cp->elements[0].dl)) { - best_cpu = cpudl_maximum(cp); - if (later_mask) - cpumask_set_cpu(best_cpu, later_mask); - } + return 1; + } else { + int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); -out: - WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + 
dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); - return best_cpu; + return 1; + } + } + return 0; } /* @@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp) { int i; - memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..2511aba36b89 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp) { int i; - memset(cp, 0, sizeof(*cp)); - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..d05bd9457a40 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) return; /* @@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. 
*/ if (p->nr_cpus_allowed != 1 && - cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, p, NULL)) return; resched_curr(rq); @@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct * +static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; @@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task) struct sched_domain *sd; struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); - int best_cpu, cpu = task_cpu(task); + int cpu = task_cpu(task); /* Make sure the mask is initialized first */ if (unlikely(!later_mask)) @@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, - task, later_mask); - if (best_cpu == -1) + if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; /* - * If we are here, some target has been found, - * the most suitable of which is cached in best_cpu. - * This is, among the runqueues where the current tasks - * have later deadlines than the task's one, the rq - * with the latest possible one. + * If we are here, some targets have been found, including + * the most suitable which is, among the runqueues where the + * current tasks have later deadlines than the task's one, the + * rq with the latest possible one. * * Now we check how well this matches with task's * affinity and system topology. 
@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task) rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; /* * If possible, preempting this_cpu is @@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } + best_cpu = cpumask_first_and(later_mask, + sched_domain_span(sd)); /* - * Last chance: if best_cpu is valid and is - * in the mask, that becomes our choice. + * Last chance: if a cpu being in both later_mask + * and current sd span is valid, that becomes our + * choice. Of course, the latest possible cpu is + * already under consideration through later_mask. */ - if (best_cpu < nr_cpu_ids && - cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..4a23bbc3111b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } +static cpumask_var_t sd_sysctl_cpus; static struct ctl_table_header *sd_sysctl_header; + void register_sched_domain_sysctl(void) { - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + static struct ctl_table *cpu_entries; + static struct ctl_table **cpu_idx; char buf[32]; + int i; - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; + if (!cpu_entries) { + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); + if (!cpu_entries) + return; - if (entry == NULL) - return; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = cpu_entries; + } - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; + if (!cpu_idx) { + struct ctl_table *e = cpu_entries; + + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); + 
if (!cpu_idx) + return; + + /* deal with sparse possible map */ + for_each_possible_cpu(i) { + cpu_idx[i] = e; + e++; + } + } + + if (!cpumask_available(sd_sysctl_cpus)) { + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) + return; + + /* init to possible to not have holes in @cpu_entries */ + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + } + + for_each_cpu(i, sd_sysctl_cpus) { + struct ctl_table *e = cpu_idx[i]; + + if (e->child) + sd_free_ctl_entry(&e->child); + + if (!e->procname) { + snprintf(buf, 32, "cpu%d", i); + e->procname = kstrdup(buf, GFP_KERNEL); + } + e->mode = 0555; + e->child = sd_alloc_ctl_cpu_table(i); + + __cpumask_clear_cpu(i, sd_sysctl_cpus); } WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +void dirty_sched_domain_sysctl(int cpu) +{ + if (cpumask_available(sd_sysctl_cpus)) + __cpumask_set_cpu(cpu, sd_sysctl_cpus); +} + /* may be called multiple times per register */ void unregister_sched_domain_sysctl(void) { unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); } #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ @@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg) } #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { if (rq->curr == p) - SEQ_printf(m, "R"); + SEQ_printf(m, ">R"); else - SEQ_printf(m, " "); + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), @@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" + " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n" - "------------------------------------------------------" + "-------------------------------------------------------" 
"----------------------------------------------------\n"); rcu_read_lock(); @@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..8d5868771cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se) /* * For !fair tasks do: * - update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * @@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. 
+ */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. 
*/ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* - * Faults_cpu is used to decide whether memory should move - * towards the CPU. As a consequence, these stats are weighted - * more by CPU use than by memory faults. - */ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + + return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; + } + + return faults; +} + /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. 
Migrations @@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); +static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); @@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) * Reset the scan period if the task is being rescheduled on an * alternative node to recheck if the tasks is now properly placed. */ - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { unsigned int period_slot; - int ratio; + int lr_ratio, ps_ratio; int diff; unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p, * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) */ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are local. There is no need to + * do fast NUMA scanning, since memory is already local. 
+ */ + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are shared with other tasks. + * There is no point in continuing fast NUMA scanning, + * since other tasks may just move the memory elsewhere. + */ + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot) slot = 1; diff = slot * period_slot; } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, + * yet they are not on the local NUMA node. Speed up + * NUMA scanning to get the memory moved over. */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + int ratio = max(lr_ratio, ps_ratio); + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; } p->numa_scan_period = clamp(p->numa_scan_period + diff, @@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) { p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); } next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now > curr->node_stamp + period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); + curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { @@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } -/* - * Can a task be moved 
from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - */ -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - struct numa_stats prev_load, this_load; - s64 this_eff_load, prev_eff_load; - - update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); - update_numa_stats(&this_load, cpu_to_node(this_cpu)); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - if (this_load.load > current_load) - this_load.load -= current_load; - else - this_load.load = 0; - } - - /* - * In low-load situations, where this_cpu's node is idle due to the - * sync cause above having dropped this_load.load to 0, move the task. - * Moving to an idle socket will not create a bad imbalance. - * - * Otherwise check if the nodes are near enough in load to allow this - * task to be woken on this_cpu's node. 
- */ - if (this_load.load > 0) { - unsigned long task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_load.compute_capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_load.compute_capacity; - - this_eff_load *= this_load.load + task_load; - prev_eff_load *= prev_load.load - task_load; - - return this_eff_load <= prev_eff_load; - } - - return true; -} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } -#ifdef CONFIG_SMP -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - return true; -} -#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + #ifdef CONFIG_SMP /* * Approximate: @@ -2968,6 +3002,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, sa->last_update_time += delta << 10; /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. 
But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!weight) + running = 0; + + /* * Now we know we crossed measurement unit boundaries. The *_avg * accrues by two steps: * @@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) -{ - if (&this_rq()->cfs == cfs_rq) { - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). - */ - cpufreq_update_util(rq_of(cfs_rq), 0); - } -} - /* * Unsigned subtract and clamp on underflow. * @@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() * @cfs_rq: cfs_rq to update - * @update_freq: should we call cfs_rq_util_change() or will the call do so * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * avg. 
The immediate corollary is that all (fair) tasks must be attached, see @@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * call update_tg_load_avg() when this function returns true. */ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { struct sched_avg *sa = &cfs_rq->avg; int decayed, removed_load = 0, removed_util = 0; @@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (update_freq && (decayed || removed_util)) + if (decayed || removed_util) cfs_rq_util_change(cfs_rq); return decayed || removed_load; @@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cpu, cfs_rq, se); - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { return 0; } @@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) static inline void update_load_avg(struct sched_entity *se, int not_used1) { - cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); + cfs_rq_util_change(cfs_rq_of(se)); } static inline void @@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(struct rq *rq) { - return 
cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load_avg(&rq->cfs); } #ifdef CONFIG_NO_HZ_COMMON @@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq) /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (weighted_cpuload(this_rq)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void) * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. */ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = weighted_cpuload(this_rq); } /* @@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = weighted_cpuload(this_rq); rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = weighted_cpuload(this_rq); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq) static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type) static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5290,7 +5312,7 @@ static unsigned long 
cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = weighted_cpuload(rq); if (nr_running) return load_avg / nr_running; @@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p) return 1; } +struct llc_stats { + unsigned long nr_running; + unsigned long load; + unsigned long capacity; + int has_capacity; +}; + +static bool get_llc_stats(struct llc_stats *stats, int cpu) +{ + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + + if (!sds) + return false; + + stats->nr_running = READ_ONCE(sds->nr_running); + stats->load = READ_ONCE(sds->load); + stats->capacity = READ_ONCE(sds->capacity); + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); + + return true; +} + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + * + * Since we're running on 'stale' values, we might in fact create an imbalance + * but recomputing these values is expensive, as that'd mean iteration 2 cache + * domains worth of CPUs. + */ +static bool +wake_affine_llc(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + struct llc_stats prev_stats, this_stats; + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + if (!get_llc_stats(&prev_stats, prev_cpu) || + !get_llc_stats(&this_stats, this_cpu)) + return false; + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current LLC. 
+ */ + if (sync) { + unsigned long current_load = task_h_load(current); + + /* in this case load hits 0 and this LLC is considered 'idle' */ + if (current_load > this_stats.load) + return true; + + this_stats.load -= current_load; + } + + /* + * The has_capacity stuff is not SMT aware, but by trying to balance + * the nr_running on both ends we try and fill the domain at equal + * rates, thereby first consuming cores before siblings. + */ + + /* if the old cache has capacity, stay there */ + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) + return false; + + /* if this cache has capacity, come here */ + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + return true; + + /* + * Check to see if we can move the load without causing too much + * imbalance. + */ + task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_stats.capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_stats.capacity; + + this_eff_load *= this_stats.load + task_load; + prev_eff_load *= prev_stats.load - task_load; + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + bool affine; /* - * Common case: CPUs are in the same socket, and select_idle_sibling() - * will do its thing regardless of what we return: + * Default to no affine wakeups; wake_affine() should not effect a task + * placement the load-balancer feels inclined to undo. The conservative + * option is therefore to not move tasks when they wake up. 
*/ - if (cpus_share_cache(prev_cpu, this_cpu)) - affine = true; - else - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); + affine = false; + + /* + * If the wakeup is across cache domains, try to evaluate if movement + * makes sense, otherwise rely on select_idle_siblings() to do + * placement inside the cache domain. + */ + if (!cpus_share_cache(prev_cpu, this_cpu)) + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = weighted_cpuload(cpu_rq(i)); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: -#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) goto idle; +#ifdef CONFIG_FAIR_GROUP_SCHED if (prev->sched_class != &fair_sched_class) goto simple; @@ -6220,11 +6337,17 @@ again: /* * This call to check_cfs_rq_runtime() will do the * throttle and dequeue its entity in the parent(s). - * Therefore the 'simple' nr_running test will indeed + * Therefore the nr_running test will indeed * be correct. 
*/ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) + if (unlikely(check_cfs_rq_runtime(cfs_rq))) { + cfs_rq = &rq->cfs; + + if (!cfs_rq->nr_running) + goto idle; + goto simple; + } } se = pick_next_entity(cfs_rq, curr); @@ -6264,12 +6387,8 @@ again: return p; simple: - cfs_rq = &rq->cfs; #endif - if (!cfs_rq->nr_running) - goto idle; - put_prev_task(rq, prev); do { @@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); rq_unlock_irqrestore(rq, &rf); } @@ -7036,6 +7155,7 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ + unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ @@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, + .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { @@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7490,6 +7611,7 @@ static 
inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { + struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ + sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -7561,6 +7684,21 @@ next_group: env->dst_rq->rd->overload = overload; } + if (!shared) + return; + + /* + * Since these are sums over groups they can contain some CPUs + * multiple times for the NUMA domains. + * + * Currently only wake_affine_llc() and find_busiest_group() + * uses these numbers, only the last is affected by this problem. + * + * XXX fix that. + */ + WRITE_ONCE(shared->nr_running, sds->total_running); + WRITE_ONCE(shared->load, sds->total_load); + WRITE_ONCE(shared->capacity, sds->total_capacity); } /** @@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + /* XXX broken for overlapping NUMA groups */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(rq); /* * When comparing with imbalance, use weighted_cpuload() diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute 
it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> +#include <linux/tick.h> +#include <linux/cpumask.h> + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from an "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even though we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration.
+ */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). 
If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..ab1c7f5409a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -769,7 +769,7 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; - struct call_single_data hrtick_csd; + call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; #endif @@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); void unregister_sched_domain_sysctl(void); #else static inline void register_sched_domain_sysctl(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} static inline void unregister_sched_domain_sysctl(void) { } diff --git 
a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610dcce11..2227e183e202 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q) { unsigned long flags; - if (!swait_active(q)) - return; - raw_spin_lock_irqsave(&q->lock, flags); swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); - if (!swait_active(q)) - return; - raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 79895aec281e..6f7b43982f73 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) static int init_rootdomain(struct root_domain *rd) { - memset(rd, 0, sizeof(*rd)); - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) @@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void) { struct root_domain *rd; - rd = kmalloc(sizeof(*rd), GFP_KERNEL); + rd = kzalloc(sizeof(*rd), GFP_KERNEL); if (!rd) return NULL; @@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) kfree(sg->sgc); - kfree(sg); + if (atomic_dec_and_test(&sg->ref)) + kfree(sg); sg = tmp; } while (sg != first); } @@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) static void destroy_sched_domain(struct sched_domain *sd) { /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. + * A normal sched domain may have multiple group references, an + * overlapping domain, having private groups, only one. Iterate, + * dropping group/capacity references, freeing where none remain. 
*/ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } + free_sched_groups(sd->groups, 1); + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); kfree(sd); @@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + dirty_sched_domain_sysctl(cpu); destroy_sched_domains(tmp); update_top_cache_domain(cpu); @@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) else cpumask_copy(sg_span, sched_domain_span(sd)); + atomic_inc(&sg->ref); return sg; } @@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } } -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { @@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); - n = doms_new ? 
ndoms_new : 0; + if (!doms_new) { + WARN_ON_ONCE(dattr_new); + n = 0; + doms_new = alloc_sched_domains(1); + if (doms_new) { + n = 1; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + } + } else { + n = ndoms_new; + } /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { @@ -1870,11 +1878,10 @@ match1: } n = ndoms_cur; - if (doms_new == NULL) { + if (!doms_new) { n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); } /* Build new domains: */ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. 
+ */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); diff --git a/kernel/smp.c b/kernel/smp.c index 3061483cb3ad..81cfca9b4cc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,7 +28,7 @@ enum { }; struct call_function_data { - struct call_single_data __percpu *csd; + call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->csd = alloc_percpu(struct call_single_data); + cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); @@ -103,12 +103,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } -static __always_inline void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd) /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other - * fields of the specified call_single_data structure: + * fields of the specified call_single_data_t structure: */ smp_wmb(); } -static __always_inline void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd) 
smp_store_release(&csd->flags, 0); } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); /* - * Insert a previously allocated call_single_data element + * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, struct call_single_data *csd, +static int generic_exec_single(int cpu, call_single_data_t *csd, smp_call_func_t func, void *info) { if (cpu == smp_processor_id()) { @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) { struct llist_head *head; struct llist_node *entry; - struct call_single_data *csd, *csd_next; + call_single_data_t *csd, *csd_next; static bool warned; WARN_ON(!irqs_disabled()); @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data *csd; - struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; + call_single_data_t *csd; + call_single_data_t csd_stack = { + .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + }; int this_cpu; int err; @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. 
*/ -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); if (wait) @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask, if (wait) { for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd; + call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + raw_spin_lock_irq(&task->pi_lock); do { work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); + raw_spin_unlock_irq(&task->pi_lock); if (!work) break; - /* - * Synchronize with task_work_cancel(). It can't remove - * the first entry == work, cmpxchg(task_works) should - * fail, but it can play with *work and other entries. 
- */ - raw_spin_unlock_wait(&task->pi_lock); do { next = work->next; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0b8ff7d257ea..ec09ce9a6012 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -28,6 +28,7 @@ #include <linux/workqueue.h> #include <linux/freezer.h> #include <linux/compat.h> +#include <linux/module.h> #include "posix-timers.h" @@ -56,9 +57,9 @@ static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); #endif +#ifdef CONFIG_RTC_CLASS static struct wakeup_source *ws; -#ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -89,6 +90,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); + struct wakeup_source *__ws; if (rtcdev) return -EBUSY; @@ -98,13 +100,25 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; + __ws = wakeup_source_register("alarmtimer"); + spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { + if (!try_module_get(rtc->owner)) { + spin_unlock_irqrestore(&rtcdev_lock, flags); + return -1; + } + rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); + ws = __ws; + __ws = NULL; } spin_unlock_irqrestore(&rtcdev_lock, flags); + + wakeup_source_unregister(__ws); + return 0; } @@ -860,7 +874,6 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } - ws = wakeup_source_register("alarmtimer"); return 0; out_drv: diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a3bd5dbe0dc4..8585ad6e472a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -799,7 +799,6 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->cpu_timers; - struct signal_struct *const sig = tsk->signal; struct 
task_cputime *tsk_expires = &tsk->cputime_expires; u64 expires; unsigned long soft; @@ -823,10 +822,9 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { - unsigned long hard = - READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -847,7 +845,8 @@ static void check_thread_timers(struct task_struct *tsk, */ if (soft < hard) { soft += USEC_PER_SEC; - sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = + soft; } if (print_fatal_signals) { pr_info("RT Watchdog Timeout (soft): %s[%d]\n", @@ -938,11 +937,10 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); - unsigned long hard = - READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); u64 x; if (psecs >= hard) { /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cedafa008de5..8ea4fb315719 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->ktime_sec = seconds; /* Update the monotonic raw base */ - seconds = tk->raw_sec; - nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); - tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* must hold timekeeper_lock */ @@ -2066,7 +2064,7 @@ void update_wall_time(void) goto out; /* Do some additional 
sanity checking */ - timekeeping_check_update(real_tk, offset); + timekeeping_check_update(tk, offset); /* * With NO_HZ we may have to accumulate many cycle_intervals diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. */ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. 
*/ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (!is_max_delta) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). 
*/ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. + */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); (*n_offl_successes)++; delta = jiffies - starttime; - sum_offl += delta; + *sum_offl += delta; if (*min_offl < 0) { *min_offl = delta; *max_offl = delta; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? 
(long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. */, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? 
__BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae91860..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50d7243..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. 
+ * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355033d4..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { 
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } @@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; @@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..13ba2d3f6a91 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,6 +306,7 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { + struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; + event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, &regs, head, NULL); + 1, &regs, head, NULL, event); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_events_filter.c 
b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9b5aa10fbf9..8a907e12b6b9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..74d9a86eccc0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); 
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
-			      1, regs, head, NULL);
+			      1, regs, head, NULL, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fec9681..4525e0271a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	}
 
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL);
+			      head, NULL, NULL);
  out:
 	preempt_enable();
 }
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
 	if (!a)
 		return;
 
-	if (!a->pages) {
-		kfree(a);
-		return;
-	}
+	if (!a->pages)
+		goto free;
 
 	for (i = 0; i < a->n_pages; i++) {
 		if (!a->pages[i])
 			break;
 		free_page((unsigned long)a->pages[i]);
 	}
+
+	kfree(a->pages);
+
+ free:
+	kfree(a);
 }
 
 struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	unsigned long flags;
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06d3389bca0d..f5d52024f6b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -240,6 +240,7 @@ static void set_sample_period(void)
 	 * hardlockup detector generates a warning
 	 */
 	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+	watchdog_update_hrtimer_threshold(sample_period);
 }
 
 /* Commands for resetting the watchdog */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 295a0d84934c..3a09ea1b1d3d 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+	/*
+	 * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+	 *
+	 * So it runs effectively with 2.5 times the rate of the NMI
+	 * watchdog. That means the hrtimer should fire 2-3 times before
+	 * the NMI watchdog expires. The NMI watchdog on x86 is based on
+	 * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+	 * might run way faster than expected and the NMI fires in a
+	 * smaller period than the one deduced from the nominal CPU
+	 * frequency. Depending on the Turbo-Mode factor this might be fast
+	 * enough to get the NMI period smaller than the hrtimer watchdog
+	 * period and trigger false positives.
+	 *
+	 * The sample threshold is used to check in the NMI handler whether
+	 * the minimum time between two NMI samples has elapsed. That
+	 * prevents false positives.
+	 *
+	 * Set this to 4/5 of the actual watchdog threshold period so the
+	 * hrtimer is guaranteed to fire at least once within the real
+	 * watchdog threshold.
+	 */
+	watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+	ktime_t delta, now = ktime_get_mono_fast_ns();
+
+	delta = now - __this_cpu_read(last_timestamp);
+	if (delta < watchdog_hrtimer_sample_threshold) {
+		/*
+		 * If ktime is jiffies based, a stalled timer would prevent
+		 * jiffies from being incremented and the filter would look
+		 * at a stale timestamp and never trigger.
+		 */
+		if (__this_cpu_inc_return(nmi_rearmed) < 10)
+			return false;
+	}
+	__this_cpu_write(nmi_rearmed, 0);
+	__this_cpu_write(last_timestamp, now);
+	return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+	return true;
+}
+#endif
+
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		return;
 	}
 
+	if (!watchdog_check_timestamp())
+		return;
+
 	/* check for a hardlockup
 	 * This is done by making sure our timer interrupt
 	 * is incrementing. The timer interrupt should have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca937b0c3a96..ab3c0dc8c7ed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2091,8 +2091,30 @@ __acquires(&pool->lock)
 
 	spin_unlock_irq(&pool->lock);
 
-	lock_map_acquire_read(&pwq->wq->lockdep_map);
+	lock_map_acquire(&pwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
+	/*
+	 * Strictly speaking we should mark the invariant state without holding
+	 * any locks, that is, before these two lock_map_acquire()'s.
+	 *
+	 * However, that would result in:
+	 *
+	 *   A(W1)
+	 *   WFC(C)
+	 *		A(W1)
+	 *		C(C)
+	 *
+	 * Which would create W1->C->W1 dependencies, even though there is no
+	 * actual deadlock possible. There are two solutions, using a
+	 * read-recursive acquire on the work(queue) 'locks', but this will then
+	 * hit the lockdep limitation on recursive locks, or simply discard
+	 * these locks.
+	 *
+	 * AFAICT there is no possible deadlock scenario between the
+	 * flush_work() and complete() primitives (except for single-threaded
+	 * workqueues), so hiding them isn't a problem.
+	 */
+	lockdep_invariant_state(true);
 	trace_workqueue_execute_start(work);
 	worker->current_func(work);
 	/*
@@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 	 */
 	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
-	init_completion(&barr->done);
+
+	/*
+	 * Explicitly init the crosslock for wq_barrier::done, make its lock
+	 * key a subkey of the corresponding work. As a result we won't
+	 * build a dependency between wq_barrier::done and unrelated work.
+	 */
+	lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
+				   "(complete)wq_barr::done",
+				   target->lockdep_map.key, 1);
+	__init_completion(&barr->done);
 	barr->task = current;
 
 	/*
@@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 	spin_unlock_irq(&pool->lock);
 
 	/*
-	 * If @max_active is 1 or rescuer is in use, flushing another work
-	 * item on the same workqueue may lead to deadlock. Make sure the
-	 * flusher is not running on the same workqueue by verifying write
-	 * access.
+	 * Force a lock recursion deadlock when using flush_work() inside a
+	 * single-threaded or rescuer equipped workqueue.
+	 *
+	 * For single threaded workqueues the deadlock happens when the work
+	 * is after the work issuing the flush_work(). For rescuer equipped
+	 * workqueues the deadlock happens when the rescuer stalls, blocking
+	 * forward progress.
 	 */
-	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
 		lock_map_acquire(&pwq->wq->lockdep_map);
-	else
-		lock_map_acquire_read(&pwq->wq->lockdep_map);
-	lock_map_release(&pwq->wq->lockdep_map);
+		lock_map_release(&pwq->wq->lockdep_map);
+	}
 
 	return true;
 already_gone:
|