summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2023-06-26 11:05:49 +0200
committerThomas Gleixner <tglx@linutronix.de>2023-06-26 11:05:49 +0200
commitf121ab7f4ac32ed2aa51035534926f9507a8308b (patch)
treed17cd6dc29b64e6d681caa70424f3beacce21f14 /kernel
parentgenirq: Use a maple tree for interrupt descriptor management (diff)
parentMerge branch irq/misc-6.5 into irq/irqchip-next (diff)
downloadlinux-f121ab7f4ac32ed2aa51035534926f9507a8308b.tar.xz
linux-f121ab7f4ac32ed2aa51035534926f9507a8308b.zip
Merge tag 'irqchip-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms into irq/core
Pull irqchip updates from Marc Zyngier: - A number of Loogson/Loogarch fixes - Allow the core code to retrigger an interrupt that has fired while the same interrupt is being handled on another CPU, papering over a GICv3 architecture issue - Work around an integration problem on ASR8601, where the CPU numbering isn't representable in the GIC implementation... - Add some missing interrupt to the STM32 irqchip - A bunch of warning squashing triggered by W=1 builds Link: https://lore.kernel.org/r/20230623224345.3577134-1-maz@kernel.org
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/hashtab.c6
-rw-r--r--kernel/bpf/offload.c2
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/irqdomain.c2
-rw-r--r--kernel/irq/msi.c4
-rw-r--r--kernel/locking/lockdep.c28
-rw-r--r--kernel/locking/rwsem.c8
-rw-r--r--kernel/module/main.c58
-rw-r--r--kernel/module/stats.c4
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/time/tick-broadcast.c120
-rw-r--r--kernel/trace/fprobe.c73
-rw-r--r--kernel/trace/rethook.c4
17 files changed, 266 insertions, 88 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 00c253b84bf5..9901efee4339 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1215,7 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1236,6 +1236,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
err:
htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
if (ret)
htab_lru_push_free(htab, l_new);
else if (l_old)
@@ -1338,7 +1339,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1361,6 +1362,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = 0;
err:
htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
if (l_new)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
return ret;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d9c9f45e3529..8a26cd8814c1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -859,4 +859,4 @@ static int __init bpf_offload_init(void)
return rhashtable_init(&offdevs, &offdevs_params);
}
-late_initcall(bpf_offload_init);
+core_initcall(bpf_offload_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbcf5a4e2fcd..5871aa78d01a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17033,7 +17033,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
insn->dst_reg,
shift);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1ULL << size * 8) - 1);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 68baa8194d9f..db016e418931 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10150,8 +10150,20 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &data, regs)) {
perf_swevent_event(event, count, &data, regs);
+
+ /*
+ * Here use the same on-stack perf_sample_data,
+ * some members in data are event-specific and
+ * need to be re-computed for different sweveents.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that next event skips preparing data
+ * because data->sample_flags is set.
+ */
+ perf_sample_data_init(&data, 0, 0);
+ perf_sample_save_raw_data(&data, &raw);
+ }
}
/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2eac5532c3c8..ee8c0acf39df 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -693,8 +693,16 @@ void handle_fasteoi_irq(struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (!irq_may_run(desc))
+ /*
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
+ */
+ if (!irq_may_run(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
goto out;
+ }
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
@@ -716,6 +724,12 @@ void handle_fasteoi_irq(struct irq_desc *desc)
cond_unmask_eoi_irq(desc, chip);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
+
raw_spin_unlock(&desc->lock);
return;
out:
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index bbcaac64038e..5971a66be034 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
};
static const struct irq_bit_descr irqdesc_states[] = {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 7bdb7507efb0..bdd35bb9c735 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -47,9 +47,12 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
* IRQS_SYSFS - descriptor has been added to sysfs
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f34760a1e222..5bd01624e447 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1915,6 +1915,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
static struct dentry *domain_dir;
static void
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7a97bcb086bf..b4c31a5c1147 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -542,7 +542,7 @@ fail:
return ret;
}
-#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
+#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN)
/**
* msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
* @dev: The device (PCI, platform etc) which will get sysfs entries
@@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev)
msi_for_each_desc(desc, dev, MSI_DESC_ALL)
msi_sysfs_remove_desc(dev, desc);
}
-#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */
#else /* CONFIG_SYSFS */
static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dcd1d5bfc1e0..4dfd2f3e09b2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2263,6 +2263,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
static inline bool usage_skip(struct lock_list *entry, void *mask)
{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
/*
* Skip local_lock() for irq inversion detection.
*
@@ -2289,14 +2292,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- return false;
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
- return true;
- }
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
- return false;
+ return true;
}
/*
@@ -4768,7 +4773,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4778,6 +4784,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acb5a50309a1..9eabd585ce7a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1240,7 +1240,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline int __down_read_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
{
int ret = 0;
long count;
@@ -1258,17 +1258,17 @@ out:
return ret;
}
-static inline void __down_read(struct rw_semaphore *sem)
+static __always_inline void __down_read(struct rw_semaphore *sem)
{
__down_read_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_read_interruptible(struct rw_semaphore *sem)
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_INTERRUPTIBLE);
}
-static inline int __down_read_killable(struct rw_semaphore *sem)
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_KILLABLE);
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 044aa2c9e3cb..b4c7e925fdb0 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3057,25 +3057,13 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
return load_module(&info, uargs, 0);
}
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+static int file_init_module(struct file *file, const char __user * uargs, int flags)
{
struct load_info info = { };
void *buf = NULL;
int len;
- int err;
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
- if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC
- |MODULE_INIT_COMPRESSED_FILE))
- return -EINVAL;
-
- len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
+ len = kernel_read_file(file, 0, &buf, INT_MAX, NULL,
READING_MODULE);
if (len < 0) {
mod_stat_inc(&failed_kreads);
@@ -3084,7 +3072,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
}
if (flags & MODULE_INIT_COMPRESSED_FILE) {
- err = module_decompress(&info, buf, len);
+ int err = module_decompress(&info, buf, len);
vfree(buf); /* compressed data is no longer needed */
if (err) {
mod_stat_inc(&failed_decompress);
@@ -3099,6 +3087,46 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return load_module(&info, uargs, flags);
}
+/*
+ * kernel_read_file() will already deny write access, but module
+ * loading wants _exclusive_ access to the file, so we do that
+ * here, along with basic sanity checks.
+ */
+static int prepare_file_for_module_load(struct file *file)
+{
+ if (!file || !(file->f_mode & FMODE_READ))
+ return -EBADF;
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return -EINVAL;
+ return exclusive_deny_write_access(file);
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ struct fd f;
+ int err;
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
+ return -EINVAL;
+
+ f = fdget(fd);
+ err = prepare_file_for_module_load(f.file);
+ if (!err) {
+ err = file_init_module(f.file, uargs, flags);
+ allow_write_access(f.file);
+ }
+ fdput(f);
+ return err;
+}
+
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
index ad7b6ada29f2..6ab2c94d6bc3 100644
--- a/kernel/module/stats.c
+++ b/kernel/module/stats.c
@@ -276,6 +276,7 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
struct mod_fail_load *mod_fail;
unsigned int len, size, count_failed = 0;
char *buf;
+ int ret;
u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
idecompress_bytes, imod_bytes, total_virtual_lost;
@@ -390,8 +391,9 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
out_unlock:
mutex_unlock(&module_mutex);
out:
+ ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
kfree(buf);
- return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+ return ret;
}
#undef MAX_PREAMBLE
#undef MAX_FAILED_MOD_PRINT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 944c3ae39861..a68d1276bab0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11492,7 +11492,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
#ifdef CONFIG_SCHED_MM_CID
-/**
+/*
* @cid_lock: Guarantee forward-progress of cid allocation.
*
* Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
@@ -11501,7 +11501,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*/
DEFINE_RAW_SPINLOCK(cid_lock);
-/**
+/*
* @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
*
* When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 93bf2b4e47e5..771d1e040303 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -35,14 +35,15 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
#ifdef CONFIG_TICK_ONESHOT
static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# ifdef CONFIG_HOTPLUG_CPU
static void tick_broadcast_oneshot_offline(unsigned int cpu);
# endif
#else
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
+static inline void
+tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
# ifdef CONFIG_HOTPLUG_CPU
@@ -264,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
ret = 1;
} else {
/*
@@ -500,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
}
}
out:
@@ -1020,48 +1021,101 @@ static inline ktime_t tick_get_next_period(void)
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
+ bool from_periodic)
{
int cpu = smp_processor_id();
+ ktime_t nexttick = 0;
if (!bc)
return;
- /* Set it up only once ! */
- if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = clockevent_state_periodic(bc);
-
- bc->event_handler = tick_handle_oneshot_broadcast;
-
+ /*
+ * When the broadcast device was switched to oneshot by the first
+ * CPU handling the NOHZ change, the other CPUs will reach this
+ * code via hrtimer_run_queues() -> tick_check_oneshot_change()
+ * too. Set up the broadcast device only once!
+ */
+ if (bc->event_handler == tick_handle_oneshot_broadcast) {
/*
- * We must be careful here. There might be other CPUs
- * waiting for periodic broadcast. We need to set the
- * oneshot_mask bits for those and program the
- * broadcast device to fire.
+ * The CPU which switched from periodic to oneshot mode
+ * set the broadcast oneshot bit for all other CPUs which
+ * are in the general (periodic) broadcast mask to ensure
+ * that CPUs which wait for the periodic broadcast are
+ * woken up.
+ *
+ * Clear the bit for the local CPU as the set bit would
+ * prevent the first tick_broadcast_enter() after this CPU
+ * switched to oneshot state to program the broadcast
+ * device.
+ *
+ * This code can also be reached via tick_broadcast_control(),
+ * but this cannot avoid the tick_broadcast_clear_oneshot()
+ * as that would break the periodic to oneshot transition of
+ * secondary CPUs. But that's harmless as the below only
+ * clears already cleared bits.
*/
+ tick_broadcast_clear_oneshot(cpu);
+ return;
+ }
+
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event = KTIME_MAX;
+
+ /*
+ * When the tick mode is switched from periodic to oneshot it must
+ * be ensured that CPUs which are waiting for periodic broadcast
+ * get their wake-up at the next tick. This is achieved by ORing
+ * tick_broadcast_mask into tick_broadcast_oneshot_mask.
+ *
+ * For other callers, e.g. broadcast device replacement,
+ * tick_broadcast_oneshot_mask must not be touched as this would
+ * set bits for CPUs which are already NOHZ, but not idle. Their
+ * next tick_broadcast_enter() would observe the bit set and fail
+ * to update the expiry time and the broadcast event device.
+ */
+ if (from_periodic) {
cpumask_copy(tmpmask, tick_broadcast_mask);
+ /* Remove the local CPU as it is obviously not idle */
cpumask_clear_cpu(cpu, tmpmask);
- cpumask_or(tick_broadcast_oneshot_mask,
- tick_broadcast_oneshot_mask, tmpmask);
+ cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);
- if (was_periodic && !cpumask_empty(tmpmask)) {
- ktime_t nextevt = tick_get_next_period();
+ /*
+ * Ensure that the oneshot broadcast handler will wake the
+ * CPUs which are still waiting for periodic broadcast.
+ */
+ nexttick = tick_get_next_period();
+ tick_broadcast_init_next_event(tmpmask, nexttick);
- clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- tick_broadcast_init_next_event(tmpmask, nextevt);
- tick_broadcast_set_event(bc, cpu, nextevt);
- } else
- bc->next_event = KTIME_MAX;
- } else {
/*
- * The first cpu which switches to oneshot mode sets
- * the bit for all other cpus which are in the general
- * (periodic) broadcast mask. So the bit is set and
- * would prevent the first broadcast enter after this
- * to program the bc device.
+ * If the underlying broadcast clock event device is
+ * already in oneshot state, then there is nothing to do.
+ * The device was already armed for the next tick
+ * in tick_handle_broadcast_periodic()
*/
- tick_broadcast_clear_oneshot(cpu);
+ if (clockevent_state_oneshot(bc))
+ return;
}
+
+ /*
+ * When switching from periodic to oneshot mode arm the broadcast
+ * device for the next tick.
+ *
+ * If the broadcast device has been replaced in oneshot mode and
+ * the oneshot broadcast mask is not empty, then arm it to expire
+ * immediately in order to reevaluate the next expiring timer.
+ * @nexttick is 0 and therefore in the past which will cause the
+ * clockevent code to force an event.
+ *
+ * For both cases the programming can be avoided when the oneshot
+ * broadcast mask is empty.
+ *
+ * tick_broadcast_set_event() implicitly switches the broadcast
+ * device to oneshot state.
+ */
+ if (!cpumask_empty(tick_broadcast_oneshot_mask))
+ tick_broadcast_set_event(bc, cpu, nexttick);
}
/*
@@ -1070,14 +1124,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
void tick_broadcast_switch_to_oneshot(void)
{
struct clock_event_device *bc;
+ enum tick_device_mode oldmode;
unsigned long flags;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ oldmode = tick_broadcast_device.mode;
tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
bc = tick_broadcast_device.evtdev;
if (bc)
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9abb3905bc8e..18d36842faf5 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -17,36 +17,30 @@
struct fprobe_rethook_node {
struct rethook_node node;
unsigned long entry_ip;
+ unsigned long entry_parent_ip;
char data[];
};
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct fprobe_rethook_node *fpr;
struct rethook_node *rh = NULL;
struct fprobe *fp;
void *entry_data = NULL;
- int bit, ret;
+ int ret = 0;
fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
-
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
if (fp->exit_handler) {
rh = rethook_try_get(fp->rethook);
if (!rh) {
fp->nmissed++;
- goto out;
+ return;
}
fpr = container_of(rh, struct fprobe_rethook_node, node);
fpr->entry_ip = ip;
+ fpr->entry_parent_ip = parent_ip;
if (fp->entry_data_size)
entry_data = fpr->data;
}
@@ -61,23 +55,60 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
else
rethook_hook(rh, ftrace_get_regs(fregs), true);
}
-out:
+}
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+ __fprobe_handler(ip, parent_ip, ops, fregs);
ftrace_test_recursion_unlock(bit);
+
}
NOKPROBE_SYMBOL(fprobe_handler);
static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- struct fprobe *fp = container_of(ops, struct fprobe, ops);
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions called before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
if (unlikely(kprobe_running())) {
fp->nmissed++;
return;
}
+
kprobe_busy_begin();
- fprobe_handler(ip, parent_ip, ops, fregs);
+ __fprobe_handler(ip, parent_ip, ops, fregs);
kprobe_busy_end();
+ ftrace_test_recursion_unlock(bit);
}
static void fprobe_exit_handler(struct rethook_node *rh, void *data,
@@ -85,14 +116,26 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
{
struct fprobe *fp = (struct fprobe *)data;
struct fprobe_rethook_node *fpr;
+ int bit;
if (!fp || fprobe_disabled(fp))
return;
fpr = container_of(rh, struct fprobe_rethook_node, node);
+ /*
+ * we need to assure no calls to traceable functions in-between the
+ * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ */
+ bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
fp->exit_handler(fp, fpr->entry_ip, regs,
fp->entry_data_size ? (void *)fpr->data : NULL);
+ ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 32c3dfdb4d6a..60f6cb2b486b 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -288,7 +288,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
* These loops must be protected from rethook_free_rcu() because those
* are accessing 'rhn->rethook'.
*/
- preempt_disable();
+ preempt_disable_notrace();
/*
* Run the handler on the shadow stack. Do not unlink the list here because
@@ -321,7 +321,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
first = first->next;
rethook_recycle(rhn);
}
- preempt_enable();
+ preempt_enable_notrace();
return correct_ret_addr;
}