summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/jump_label.c4
-rw-r--r--kernel/ksysfs.c7
-rw-r--r--kernel/locking/qspinlock_paravirt.h2
-rw-r--r--kernel/profile.c242
-rw-r--r--kernel/sched/core.c68
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/stats.c10
-rw-r--r--kernel/task_work.c6
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/tick-broadcast.c3
10 files changed, 79 insertions, 271 deletions
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4ad5ed8adf96..6dc76b590703 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key)
}
jump_label_lock();
- if (atomic_cmpxchg(&key->enabled, 1, 0))
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
jump_label_update(key);
jump_label_unlock();
}
@@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key)
return;
guard(mutex)(&jump_label_mutex);
- if (atomic_cmpxchg(&key->enabled, 1, 0))
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
jump_label_update(key);
else
WARN_ON_ONCE(!static_key_slow_try_dec(key));
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 07fb5987b42b..1bab21b4718f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj,
const char *buf, size_t count)
{
int ret;
+ static DEFINE_MUTEX(lock);
+ /*
+ * We need serialization, for profile_setup() initializes prof_on
+ * value and profile_init() must not reallocate prof_buffer after
+ * once allocated.
+ */
+ guard(mutex)(&lock);
if (prof_on)
return -EEXIST;
/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f5a36e67b593..ac2e22502741 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- enum vcpu_state old = vcpu_halted;
+ u8 old = vcpu_halted;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
diff --git a/kernel/profile.c b/kernel/profile.c
index 2b775cc5c28f..1fcf1adcf4eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -47,30 +47,14 @@ static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
-static cpumask_var_t prof_cpu_mask;
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
-static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DEFINE_MUTEX(profile_flip_mutex);
-#endif /* CONFIG_SMP */
-
int profile_setup(char *str)
{
static const char schedstr[] = "schedule";
- static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
const char *select = NULL;
int par;
- if (!strncmp(str, sleepstr, strlen(sleepstr))) {
-#ifdef CONFIG_SCHEDSTATS
- force_schedstat_enabled();
- prof_on = SLEEP_PROFILING;
- select = sleepstr;
-#else
- pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
-#endif /* CONFIG_SCHEDSTATS */
- } else if (!strncmp(str, schedstr, strlen(schedstr))) {
+ if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
@@ -114,11 +98,6 @@ int __ref profile_init(void)
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
-
prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
if (prof_buffer)
return 0;
@@ -132,195 +111,16 @@ int __ref profile_init(void)
if (prof_buffer)
return 0;
- free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
}
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-/*
- * Each cpu has a pair of open-addressed hashtables for pending
- * profile hits. read_profile() IPI's all cpus to request them
- * to flip buffers and flushes their contents to prof_buffer itself.
- * Flip requests are serialized by the profile_flip_mutex. The sole
- * use of having a second hashtable is for avoiding cacheline
- * contention that would otherwise happen during flushes of pending
- * profile hits required for the accuracy of reported profile hits
- * and so resurrect the interrupt livelock issue.
- *
- * The open-addressed hashtables are indexed by profile buffer slot
- * and hold the number of pending hits to that profile buffer slot on
- * a cpu in an entry. When the hashtable overflows, all pending hits
- * are accounted to their corresponding profile buffer slots with
- * atomic_add() and the hashtable emptied. As numerous pending hits
- * may be accounted to a profile buffer slot in a hashtable entry,
- * this amortizes a number of atomic profile buffer increments likely
- * to be far larger than the number of entries in the hashtable,
- * particularly given that the number of distinct profile buffer
- * positions to which hits are accounted during short intervals (e.g.
- * several seconds) is usually very small. Exclusion from buffer
- * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
- * process context).
- * The hash function is meant to be lightweight as opposed to strong,
- * and was vaguely inspired by ppc64 firmware-supported inverted
- * pagetable hash functions, but uses a full hashtable full of finite
- * collision chains, not just pairs of them.
- *
- * -- nyc
- */
-static void __profile_flip_buffers(void *unused)
-{
- int cpu = smp_processor_id();
-
- per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
-}
-
-static void profile_flip_buffers(void)
-{
- int i, j, cpu;
-
- mutex_lock(&profile_flip_mutex);
- j = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- if (!hits[i].hits) {
- if (hits[i].pc)
- hits[i].pc = 0;
- continue;
- }
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].hits = hits[i].pc = 0;
- }
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void profile_discard_flip_buffers(void)
-{
- int i, cpu;
-
- mutex_lock(&profile_flip_mutex);
- i = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
- memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
-{
- unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
- int i, j, cpu;
- struct profile_hit *hits;
-
- pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
- i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- cpu = get_cpu();
- hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
- if (!hits) {
- put_cpu();
- return;
- }
- /*
- * We buffer the global profiler buffer into a per-CPU
- * queue and thus reduce the number of global (and possibly
- * NUMA-alien) accesses. The write-queue is self-coalescing:
- */
- local_irq_save(flags);
- do {
- for (j = 0; j < PROFILE_GRPSZ; ++j) {
- if (hits[i + j].pc == pc) {
- hits[i + j].hits += nr_hits;
- goto out;
- } else if (!hits[i + j].hits) {
- hits[i + j].pc = pc;
- hits[i + j].hits = nr_hits;
- goto out;
- }
- }
- i = (i + secondary) & (NR_PROFILE_HIT - 1);
- } while (i != primary);
-
- /*
- * Add the current hit(s) and flush the write-queue out
- * to the global buffer:
- */
- atomic_add(nr_hits, &prof_buffer[pc]);
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].pc = hits[i].hits = 0;
- }
-out:
- local_irq_restore(flags);
- put_cpu();
-}
-
-static int profile_dead_cpu(unsigned int cpu)
-{
- struct page *page;
- int i;
-
- if (cpumask_available(prof_cpu_mask))
- cpumask_clear_cpu(cpu, prof_cpu_mask);
-
- for (i = 0; i < 2; i++) {
- if (per_cpu(cpu_profile_hits, cpu)[i]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
- per_cpu(cpu_profile_hits, cpu)[i] = NULL;
- __free_page(page);
- }
- }
- return 0;
-}
-
-static int profile_prepare_cpu(unsigned int cpu)
-{
- int i, node = cpu_to_mem(cpu);
- struct page *page;
-
- per_cpu(cpu_profile_flip, cpu) = 0;
-
- for (i = 0; i < 2; i++) {
- if (per_cpu(cpu_profile_hits, cpu)[i])
- continue;
-
- page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
- if (!page) {
- profile_dead_cpu(cpu);
- return -ENOMEM;
- }
- per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
-
- }
- return 0;
-}
-
-static int profile_online_cpu(unsigned int cpu)
-{
- if (cpumask_available(prof_cpu_mask))
- cpumask_set_cpu(cpu, prof_cpu_mask);
-
- return 0;
-}
-
-#else /* !CONFIG_SMP */
-#define profile_flip_buffers() do { } while (0)
-#define profile_discard_flip_buffers() do { } while (0)
-
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
- atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+ if (pc < prof_len)
+ atomic_add(nr_hits, &prof_buffer[pc]);
}
-#endif /* !CONFIG_SMP */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -334,8 +134,8 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
- cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
+ /* This is the old kernel-only legacy profiling */
+ if (!user_mode(regs))
profile_hit(type, (void *)profile_pc(regs));
}
@@ -358,7 +158,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
char *pnt;
unsigned long sample_step = 1UL << prof_shift;
- profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
return 0;
if (count > (prof_len+1)*sizeof(unsigned int) - p)
@@ -404,7 +203,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return -EINVAL;
}
#endif
- profile_discard_flip_buffers();
memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
return count;
}
@@ -418,40 +216,14 @@ static const struct proc_ops profile_proc_ops = {
int __ref create_proc_profile(void)
{
struct proc_dir_entry *entry;
-#ifdef CONFIG_SMP
- enum cpuhp_state online_state;
-#endif
-
int err = 0;
if (!prof_on)
return 0;
-#ifdef CONFIG_SMP
- err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
- profile_prepare_cpu, profile_dead_cpu);
- if (err)
- return err;
-
- err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
- profile_online_cpu, NULL);
- if (err < 0)
- goto err_state_prep;
- online_state = err;
- err = 0;
-#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &profile_proc_ops);
- if (!entry)
- goto err_state_onl;
- proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
-
- return err;
-err_state_onl:
-#ifdef CONFIG_SMP
- cpuhp_remove_state(online_state);
-err_state_prep:
- cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
-#endif
+ if (entry)
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
return err;
}
subsys_initcall(create_proc_profile);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a9f655025607..f3951e4a55e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7845,6 +7845,30 @@ void set_rq_offline(struct rq *rq)
}
}
+static inline void sched_set_rq_online(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+static inline void sched_set_rq_offline(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/*
* used to mark begin/end of suspend/resume:
*/
@@ -7895,10 +7919,25 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
+static inline void sched_smt_present_inc(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+}
+
+static inline void sched_smt_present_dec(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+}
+
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
/*
* Clear the balance_push callback and prepare to schedule
@@ -7906,13 +7945,10 @@ int sched_cpu_activate(unsigned int cpu)
*/
balance_push_set(cpu, false);
-#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_inc_cpuslocked(&sched_smt_present);
-#endif
+ sched_smt_present_inc(cpu);
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -7930,12 +7966,7 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_online(rq);
- }
- rq_unlock_irqrestore(rq, &rf);
+ sched_set_rq_online(rq, cpu);
return 0;
}
@@ -7943,7 +7974,6 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
int ret;
/*
@@ -7974,20 +8004,14 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
- rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- rq_unlock_irqrestore(rq, &rf);
+ sched_set_rq_offline(rq, cpu);
-#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_dec_cpuslocked(&sched_smt_present);
+ sched_smt_present_dec(cpu);
+#ifdef CONFIG_SCHED_SMT
sched_core_cpu_deactivate(cpu);
#endif
@@ -7997,6 +8021,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ sched_smt_present_inc(cpu);
+ sched_set_rq_online(rq, cpu);
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
sched_update_numa(cpu, true);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5e00293ae43..0bed0fa1acd9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ /*
+ * Because mul_u64_u64_div_u64() can approximate on some
+ * achitectures; enforce the constraint that: a*b/(b+c) <= a.
+ */
+ if (unlikely(stime > rtime))
+ stime = rtime;
update:
/*
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 78e48f5426ee..eb0cdcd4d921 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
trace_sched_stat_blocked(p, delta);
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(p),
- delta >> 20);
- }
account_scheduler_latency(p, delta >> 10, 0);
}
}
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c2daa7ad3f9..5d14d639ac71 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -6,12 +6,14 @@
static struct callback_head work_exited; /* all we need is ->next == NULL */
+#ifdef CONFIG_IRQ_WORK
static void task_work_set_notify_irq(struct irq_work *entry)
{
test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+#endif
/**
* task_work_add - ask the @task to execute @work->func()
@@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
if (notify == TWA_NMI_CURRENT) {
if (WARN_ON_ONCE(task != current))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_IRQ_WORK))
+ return -EINVAL;
} else {
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
@@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL_NO_IPI:
__set_notify_signal(task);
break;
+#ifdef CONFIG_IRQ_WORK
case TWA_NMI_CURRENT:
irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
break;
+#endif
default:
WARN_ON_ONCE(1);
break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d25ba49e313c..d0538a75f4c6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -246,7 +246,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
if (wd_delay <= WATCHDOG_MAX_SKEW) {
- if (nretries > 1 || nretries >= max_retries) {
+ if (nretries > 1 && nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b4843099a8da..ed58eebb4e8f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void)
#ifdef CONFIG_HOTPLUG_CPU
void hotplug_cpu__broadcast_tick_pull(int deadcpu)
{
- struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *bc;
unsigned long flags;
@@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
* device to avoid the starvation.
*/
if (tick_check_broadcast_expired()) {
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask);
tick_program_event(td->evtdev->next_event, 1);
}