author		Ingo Molnar <mingo@kernel.org>	2020-08-06 10:16:38 +0200
committer	Ingo Molnar <mingo@kernel.org>	2020-08-06 10:16:38 +0200
commit		a703f3633ff1d982bc4adfe7e0921bedb1701216 (patch)
tree		eb85b29a0bbcb29045e197ab77e18ffc8649a722
parent		locking/lockdep: Fix overflow in presentation of average lock-time (diff)
parent		seqcount: More consistent seqprop names (diff)
download	linux-a703f3633ff1d982bc4adfe7e0921bedb1701216.tar.xz
		linux-a703f3633ff1d982bc4adfe7e0921bedb1701216.zip
Merge branch 'WIP.locking/seqlocks' into locking/urgent
Pick up the full seqlock series PeterZ is working on.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
86 files changed, 1583 insertions(+), 822 deletions(-)
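
For orientation before the patch body: the central change of the merged seqlock series is associating a sequence counter with the lock that serializes its writers, so lockdep can validate that the write side is actually protected. The sketch below is not part of the commit; it is a minimal, hypothetical example (struct foo_stats and its helpers are invented for illustration) built only from APIs that appear in this patch and its new documentation — seqcount_spinlock_t, seqcount_spinlock_init(), write_seqcount_begin()/end() and read_seqcount_begin()/retry()::

    /*
     * Hypothetical sketch (not from this patch): a statistics block whose
     * writers are already serialized by a spinlock.  With this series, the
     * seqcount is associated with that lock at init time so lockdep can
     * assert the lock is held when the write side begins.
     */
    #include <linux/seqlock.h>
    #include <linux/spinlock.h>

    struct foo_stats {
    	spinlock_t		lock;	/* serializes writers */
    	seqcount_spinlock_t	seq;	/* associated with ->lock */
    	u64			packets;
    	u64			bytes;
    };

    static void foo_stats_init(struct foo_stats *st)
    {
    	spin_lock_init(&st->lock);
    	seqcount_spinlock_init(&st->seq, &st->lock);
    	st->packets = 0;
    	st->bytes = 0;
    }

    /* Writer: runs with st->lock held; lockdep checks this at _begin() */
    static void foo_stats_update(struct foo_stats *st, u64 len)
    {
    	spin_lock(&st->lock);
    	write_seqcount_begin(&st->seq);
    	st->packets++;
    	st->bytes += len;
    	write_seqcount_end(&st->seq);
    	spin_unlock(&st->lock);
    }

    /* Lockless reader: retries if a writer was active during the copy */
    static void foo_stats_read(struct foo_stats *st, u64 *packets, u64 *bytes)
    {
    	unsigned int seq;

    	do {
    		seq = read_seqcount_begin(&st->seq);
    		*packets = st->packets;
    		*bytes = st->bytes;
    	} while (read_seqcount_retry(&st->seq, seq));
    }

The same pattern is what the conversions in this merge apply to existing users (dcache, fs_struct, raid5, userfaultfd, blk-iocost, dma-resv), each picking the seqcount_LOCKTYPE_t variant that matches its existing writer lock.
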
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst index d785878cad65..7003bd5aeff4 100644 --- a/Documentation/locking/index.rst +++ b/Documentation/locking/index.rst @@ -14,6 +14,7 @@ locking mutex-design rt-mutex-design rt-mutex + seqlock spinlocks ww-mutex-design preempt-locking diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst index 4d8236b81fa5..8f3e9a5141f9 100644 --- a/Documentation/locking/mutex-design.rst +++ b/Documentation/locking/mutex-design.rst @@ -18,7 +18,7 @@ as an alternative to these. This new data structure provided a number of advantages, including simpler interfaces, and at that time smaller code (see Disadvantages). -[1] http://lwn.net/Articles/164802/ +[1] https://lwn.net/Articles/164802/ Implementation -------------- diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst new file mode 100644 index 000000000000..62c5ad98c11c --- /dev/null +++ b/Documentation/locking/seqlock.rst @@ -0,0 +1,222 @@ +====================================== +Sequence counters and sequential locks +====================================== + +Introduction +============ + +Sequence counters are a reader-writer consistency mechanism with +lockless readers (read-only retry loops), and no writer starvation. They +are used for data that's rarely written to (e.g. system time), where the +reader wants a consistent set of information and is willing to retry if +that information changes. + +A data set is consistent when the sequence count at the beginning of the +read side critical section is even and the same sequence count value is +read again at the end of the critical section. The data in the set must +be copied out inside the read side critical section. If the sequence +count has changed between the start and the end of the critical section, +the reader must retry. + +Writers increment the sequence count at the start and the end of their +critical section. After starting the critical section the sequence count +is odd and indicates to the readers that an update is in progress. At +the end of the write side critical section the sequence count becomes +even again which lets readers make progress. + +A sequence counter write side critical section must never be preempted +or interrupted by read side sections. Otherwise the reader will spin for +the entire scheduler tick due to the odd sequence count value and the +interrupted writer. If that reader belongs to a real-time scheduling +class, it can spin forever and the kernel will livelock. + +This mechanism cannot be used if the protected data contains pointers, +as the writer can invalidate a pointer that the reader is following. + + +.. _seqcount_t: + +Sequence counters (``seqcount_t``) +================================== + +This is the the raw counting mechanism, which does not protect against +multiple writers. Write side critical sections must thus be serialized +by an external lock. + +If the write serialization primitive is not implicitly disabling +preemption, preemption must be explicitly disabled before entering the +write side section. If the read section can be invoked from hardirq or +softirq contexts, interrupts or bottom halves must also be respectively +disabled before entering the write section. + +If it's desired to automatically handle the sequence counter +requirements of writer serialization and non-preemptibility, use +:ref:`seqlock_t` instead. 
+ +Initialization:: + + /* dynamic */ + seqcount_t foo_seqcount; + seqcount_init(&foo_seqcount); + + /* static */ + static seqcount_t foo_seqcount = SEQCNT_ZERO(foo_seqcount); + + /* C99 struct init */ + struct { + .seq = SEQCNT_ZERO(foo.seq), + } foo; + +Write path:: + + /* Serialized context with disabled preemption */ + + write_seqcount_begin(&foo_seqcount); + + /* ... [[write-side critical section]] ... */ + + write_seqcount_end(&foo_seqcount); + +Read path:: + + do { + seq = read_seqcount_begin(&foo_seqcount); + + /* ... [[read-side critical section]] ... */ + + } while (read_seqcount_retry(&foo_seqcount, seq)); + + +.. _seqcount_locktype_t: + +Sequence counters with associated locks (``seqcount_LOCKTYPE_t``) +----------------------------------------------------------------- + +As discussed at :ref:`seqcount_t`, sequence count write side critical +sections must be serialized and non-preemptible. This variant of +sequence counters associate the lock used for writer serialization at +initialization time, which enables lockdep to validate that the write +side critical sections are properly serialized. + +This lock association is a NOOP if lockdep is disabled and has neither +storage nor runtime overhead. If lockdep is enabled, the lock pointer is +stored in struct seqcount and lockdep's "lock is held" assertions are +injected at the beginning of the write side critical section to validate +that it is properly protected. + +For lock types which do not implicitly disable preemption, preemption +protection is enforced in the write side function. + +The following sequence counters with associated locks are defined: + + - ``seqcount_spinlock_t`` + - ``seqcount_raw_spinlock_t`` + - ``seqcount_rwlock_t`` + - ``seqcount_mutex_t`` + - ``seqcount_ww_mutex_t`` + +The plain seqcount read and write APIs branch out to the specific +seqcount_LOCKTYPE_t implementation at compile-time. This avoids kernel +API explosion per each new seqcount LOCKTYPE. + +Initialization (replace "LOCKTYPE" with one of the supported locks):: + + /* dynamic */ + seqcount_LOCKTYPE_t foo_seqcount; + seqcount_LOCKTYPE_init(&foo_seqcount, &lock); + + /* static */ + static seqcount_LOCKTYPE_t foo_seqcount = + SEQCNT_LOCKTYPE_ZERO(foo_seqcount, &lock); + + /* C99 struct init */ + struct { + .seq = SEQCNT_LOCKTYPE_ZERO(foo.seq, &lock), + } foo; + +Write path: same as in :ref:`seqcount_t`, while running from a context +with the associated LOCKTYPE lock acquired. + +Read path: same as in :ref:`seqcount_t`. + +.. _seqlock_t: + +Sequential locks (``seqlock_t``) +================================ + +This contains the :ref:`seqcount_t` mechanism earlier discussed, plus an +embedded spinlock for writer serialization and non-preemptibility. + +If the read side section can be invoked from hardirq or softirq context, +use the write side function variants which disable interrupts or bottom +halves respectively. + +Initialization:: + + /* dynamic */ + seqlock_t foo_seqlock; + seqlock_init(&foo_seqlock); + + /* static */ + static DEFINE_SEQLOCK(foo_seqlock); + + /* C99 struct init */ + struct { + .seql = __SEQLOCK_UNLOCKED(foo.seql) + } foo; + +Write path:: + + write_seqlock(&foo_seqlock); + + /* ... [[write-side critical section]] ... */ + + write_sequnlock(&foo_seqlock); + +Read path, three categories: + +1. Normal Sequence readers which never block a writer but they must + retry if a writer is in progress by detecting change in the sequence + number. 
Writers do not wait for a sequence reader:: + + do { + seq = read_seqbegin(&foo_seqlock); + + /* ... [[read-side critical section]] ... */ + + } while (read_seqretry(&foo_seqlock, seq)); + +2. Locking readers which will wait if a writer or another locking reader + is in progress. A locking reader in progress will also block a writer + from entering its critical section. This read lock is + exclusive. Unlike rwlock_t, only one locking reader can acquire it:: + + read_seqlock_excl(&foo_seqlock); + + /* ... [[read-side critical section]] ... */ + + read_sequnlock_excl(&foo_seqlock); + +3. Conditional lockless reader (as in 1), or locking reader (as in 2), + according to a passed marker. This is used to avoid lockless readers + starvation (too much retry loops) in case of a sharp spike in write + activity. First, a lockless read is tried (even marker passed). If + that trial fails (odd sequence counter is returned, which is used as + the next iteration marker), the lockless read is transformed to a + full locking read and no retry loop is necessary:: + + /* marker; even initialization */ + int seq = 0; + do { + read_seqbegin_or_lock(&foo_seqlock, &seq); + + /* ... [[read-side critical section]] ... */ + + } while (need_seqretry(&foo_seqlock, seq)); + done_seqretry(&foo_seqlock, seq); + + +API documentation +================= + +.. kernel-doc:: include/linux/seqlock.h diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 2144530d1428..e2093994fd0d 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -24,7 +24,6 @@ #define __atomic_acquire_fence() #define __atomic_post_full_fence() -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 7298ce84762e..c614857eb209 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -14,8 +14,6 @@ #include <asm/barrier.h> #include <asm/smp.h> -#define ATOMIC_INIT(i) { (i) } - #ifndef CONFIG_ARC_PLAT_EZNPS #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 75bb2c543e59..455eb19a5ac1 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -15,8 +15,6 @@ #include <asm/barrier.h> #include <asm/cmpxchg.h> -#define ATOMIC_INIT(i) { (i) } - #ifdef __KERNEL__ /* diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h index f44f448537f2..e2fcb3cfd3de 100644 --- a/arch/arm/include/asm/percpu.h +++ b/arch/arm/include/asm/percpu.h @@ -5,6 +5,8 @@ #ifndef _ASM_ARM_PERCPU_H_ #define _ASM_ARM_PERCPU_H_ +register unsigned long current_stack_pointer asm ("sp"); + /* * Same as asm-generic/percpu.h, except that we store the per cpu offset * in the TPIDRPRW. 
TPIDRPRW only exists on V6K and V7 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 3609a6980c34..536b6b979f63 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -76,11 +76,6 @@ struct thread_info { } /* - * how to get the current stack pointer in C - */ -register unsigned long current_stack_pointer asm ("sp"); - -/* * how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) __attribute_const__; diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index a08890da696c..015ddffaf6ca 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -99,8 +99,6 @@ static inline long arch_atomic64_dec_if_positive(atomic64_t *v) return __lse_ll_sc_body(atomic64_dec_if_positive, v); } -#define ATOMIC_INIT(i) { (i) } - #define arch_atomic_read(v) __READ_ONCE((v)->counter) #define arch_atomic_set(v, i) __WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index c6b6a06231b2..a990d151f163 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -12,8 +12,6 @@ * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 0231d69c8bf2..4ab895d7111f 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -12,8 +12,6 @@ #include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } - /* Normal writes in our arch don't clear lock reservations */ static inline void atomic_set(atomic_t *v, int new) diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 50440f3ddc43..f267d956458f 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -19,7 +19,6 @@ #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 47228b0d4163..756c5cc58f94 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -16,8 +16,6 @@ * We do not have SMP m68k systems, so we don't have to deal with that. 
*/ -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index e5ac88392d1f..f904084fcb1f 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -45,7 +45,6 @@ static __always_inline type pfx##_xchg(pfx##_t *v, type n) \ return xchg(&v->counter, n); \ } -#define ATOMIC_INIT(i) { (i) } ATOMIC_OPS(atomic, int) #ifdef CONFIG_64BIT diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 118953d41763..f960e2f32b1b 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -136,8 +136,6 @@ ATOMIC_OPS(xor, ^=) #undef ATOMIC_OP_RETURN #undef ATOMIC_OP -#define ATOMIC_INIT(i) { (i) } - #ifdef CONFIG_64BIT #define ATOMIC64_INIT(i) { (i) } diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 498785ffc25f..0311c3c42960 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -11,8 +11,6 @@ #include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } - /* * Since *_return_relaxed and {cmp}xchg_relaxed are implemented with * a "bne-" instruction at the end, so an isync is enough as a acquire barrier diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h new file mode 100644 index 000000000000..1625888f27ef --- /dev/null +++ b/arch/powerpc/include/asm/dtl.h @@ -0,0 +1,52 @@ +#ifndef _ASM_POWERPC_DTL_H +#define _ASM_POWERPC_DTL_H + +#include <asm/lppaca.h> +#include <linux/spinlock_types.h> + +/* + * Layout of entries in the hypervisor's dispatch trace log buffer. + */ +struct dtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + __be16 processor_id; + __be32 enqueue_to_dispatch_time; + __be32 ready_to_enqueue_time; + __be32 waiting_to_ready_time; + __be64 timebase; + __be64 fault_addr; + __be64 srr0; + __be64 srr1; +}; + +#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ +#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) + +/* + * Dispatch trace log event enable mask: + * 0x1: voluntary virtual processor waits + * 0x2: time-slice preempts + * 0x4: virtual partition memory page faults + */ +#define DTL_LOG_CEDE 0x1 +#define DTL_LOG_PREEMPT 0x2 +#define DTL_LOG_FAULT 0x4 +#define DTL_LOG_ALL (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT) + +extern struct kmem_cache *dtl_cache; +extern rwlock_t dtl_access_lock; + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls + * reading from the dispatch trace log. If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. 
+ */ +extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + +extern void register_dtl_buffer(int cpu); +extern void alloc_dtl_buffers(unsigned long *time_limit); +extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); + +#endif /* _ASM_POWERPC_DTL_H */ diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 3b4b305796ae..c390ec377bae 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -42,7 +42,6 @@ */ #include <linux/cache.h> #include <linux/threads.h> -#include <linux/spinlock_types.h> #include <asm/types.h> #include <asm/mmu.h> #include <asm/firmware.h> @@ -146,49 +145,6 @@ struct slb_shadow { } save_area[SLB_NUM_BOLTED]; } ____cacheline_aligned; -/* - * Layout of entries in the hypervisor's dispatch trace log buffer. - */ -struct dtl_entry { - u8 dispatch_reason; - u8 preempt_reason; - __be16 processor_id; - __be32 enqueue_to_dispatch_time; - __be32 ready_to_enqueue_time; - __be32 waiting_to_ready_time; - __be64 timebase; - __be64 fault_addr; - __be64 srr0; - __be64 srr1; -}; - -#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ -#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) - -/* - * Dispatch trace log event enable mask: - * 0x1: voluntary virtual processor waits - * 0x2: time-slice preempts - * 0x4: virtual partition memory page faults - */ -#define DTL_LOG_CEDE 0x1 -#define DTL_LOG_PREEMPT 0x2 -#define DTL_LOG_FAULT 0x4 -#define DTL_LOG_ALL (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT) - -extern struct kmem_cache *dtl_cache; -extern rwlock_t dtl_access_lock; - -/* - * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls - * reading from the dispatch trace log. If other code wants to consume - * DTL entries, it can set this pointer to a function that will get - * called once for each DTL entry that gets processed. - */ -extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); - -extern void register_dtl_buffer(int cpu); -extern void alloc_dtl_buffers(unsigned long *time_limit); extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); #endif /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 45a839a7c6cf..84b2564cf5a4 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,7 +29,6 @@ #include <asm/hmi.h> #include <asm/cpuidle.h> #include <asm/atomic.h> -#include <asm/rtas-types.h> #include <asm-generic/mmiowb_types.h> @@ -53,6 +52,7 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */ #define get_slb_shadow() (get_paca()->slb_shadow_ptr) struct task_struct; +struct rtas_args; /* * Defines the layout of the paca. diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6fcae436ae51..f85539ebb513 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -183,6 +183,8 @@ static inline unsigned long read_spurr(unsigned long tb) #ifdef CONFIG_PPC_SPLPAR +#include <asm/dtl.h> + /* * Scan the dispatch trace log and count up the stolen time. * Should be called with interrupts disabled. 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6bf66649ab92..ebb04f331ad3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -74,6 +74,7 @@ #include <asm/hw_breakpoint.h> #include <asm/kvm_book3s_uvmem.h> #include <asm/ultravisor.h> +#include <asm/dtl.h> #include "book3s.h" diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index eab8aa293743..982f069e4c31 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -12,6 +12,7 @@ #include <asm/smp.h> #include <linux/uaccess.h> #include <asm/firmware.h> +#include <asm/dtl.h> #include <asm/lppaca.h> #include <asm/debugfs.h> #include <asm/plpar_wrappers.h> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index fd26f3d21d7b..f71ff2c94efe 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -40,6 +40,7 @@ #include <asm/fadump.h> #include <asm/asm-prototypes.h> #include <asm/debugfs.h> +#include <asm/dtl.h> #include "pseries.h" diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2db8469e475f..27094c872fd6 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -70,6 +70,7 @@ #include <asm/idle.h> #include <asm/swiotlb.h> #include <asm/svm.h> +#include <asm/dtl.h> #include "pseries.h" #include "../../../../drivers/pci/pci.h" diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 40c0637203d5..e6d7a344d9f2 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -11,6 +11,7 @@ #include <asm/svm.h> #include <asm/swiotlb.h> #include <asm/ultravisor.h> +#include <asm/dtl.h> static int __init init_svm(void) { diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 96f95c9ebd97..400a8c8b6de7 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -19,8 +19,6 @@ #include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } - #define __atomic_acquire_fence() \ __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory") diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 491ad53a0d4e..cae473a7b6f7 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,8 +15,6 @@ #include <asm/barrier.h> #include <asm/cmpxchg.h> -#define ATOMIC_INIT(i) { (i) } - static inline int atomic_read(const atomic_t *v) { int c; diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7326f110d48c..f48a43b63d9e 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -10,6 +10,7 @@ #include <asm/sigp.h> #include <asm/lowcore.h> +#include <asm/processor.h> #define raw_smp_processor_id() (S390_lowcore.cpu_nr) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index e582fbe59e20..13a04fcf7762 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -24,7 +24,6 @@ #ifndef __ASSEMBLY__ #include <asm/lowcore.h> #include <asm/page.h> -#include <asm/processor.h> #define STACK_INIT_OFFSET \ (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs)) diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index f37b95a80232..7c2a8a703b9a 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -19,8 +19,6 @@ 
#include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 94c930f0bc62..efad5532f169 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -18,8 +18,6 @@ #include <asm/barrier.h> #include <asm-generic/atomic64.h> -#define ATOMIC_INIT(i) { (i) } - int atomic_add_return(int, atomic_t *); int atomic_fetch_add(int, atomic_t *); int atomic_fetch_and(int, atomic_t *); diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index b60448397d4f..6b235d3d1d9d 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -12,7 +12,6 @@ #include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/sparc/include/asm/percpu_64.h b/arch/sparc/include/asm/percpu_64.h index 32ef6f05cc56..a8786a4b90b6 100644 --- a/arch/sparc/include/asm/percpu_64.h +++ b/arch/sparc/include/asm/percpu_64.h @@ -4,7 +4,9 @@ #include <linux/compiler.h> +#ifndef BUILD_VDSO register unsigned long __local_per_cpu_offset asm("g5"); +#endif #ifdef CONFIG_SMP diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h index 0f6d0c4f6683..ace0d48e837e 100644 --- a/arch/sparc/include/asm/trap_block.h +++ b/arch/sparc/include/asm/trap_block.h @@ -2,6 +2,8 @@ #ifndef _SPARC_TRAP_BLOCK_H #define _SPARC_TRAP_BLOCK_H +#include <linux/threads.h> + #include <asm/hypervisor.h> #include <asm/asi.h> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f09288431f28..54ad1890aefc 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -559,8 +559,7 @@ SYSCALL_DEFINE0(ni_syscall) } /** - * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional - * RCU handling + * idtentry_enter - Handle state tracking on ordinary idtentries * @regs: Pointer to pt_regs of interrupted context * * Invokes: @@ -572,6 +571,9 @@ SYSCALL_DEFINE0(ni_syscall) * - The hardirq tracer to keep the state consistent as low level ASM * entry disabled interrupts. * + * As a precondition, this requires that the entry came from user mode, + * idle, or a kernel context in which RCU is watching. + * * For kernel mode entries RCU handling is done conditional. If RCU is * watching then the only RCU requirement is to check whether the tick has * to be restarted. If RCU is not watching then rcu_irq_enter() has to be @@ -585,18 +587,21 @@ SYSCALL_DEFINE0(ni_syscall) * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * - * Returns: True if RCU has been adjusted on a kernel entry - * False otherwise + * Returns: An opaque object that must be passed to idtentry_exit() * - * The return value must be fed into the rcu_exit argument of - * idtentry_exit_cond_rcu(). + * The return value must be fed into the state argument of + * idtentry_exit(). 
*/ -bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) +noinstr idtentry_state_t idtentry_enter(struct pt_regs *regs) { + idtentry_state_t ret = { + .exit_rcu = false, + }; + if (user_mode(regs)) { check_user_regs(regs); enter_from_user_mode(); - return false; + return ret; } /* @@ -634,7 +639,8 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) trace_hardirqs_off_finish(); instrumentation_end(); - return true; + ret.exit_rcu = true; + return ret; } /* @@ -649,7 +655,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) trace_hardirqs_off(); instrumentation_end(); - return false; + return ret; } static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) @@ -667,10 +673,9 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) } /** - * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU - * handling + * idtentry_exit - Handle return from exception that used idtentry_enter() * @regs: Pointer to pt_regs (exception entry regs) - * @rcu_exit: Invoke rcu_irq_exit() if true + * @state: Return value from matching call to idtentry_enter() * * Depending on the return target (kernel/user) this runs the necessary * preemption and work checks if possible and reguired and returns to @@ -679,10 +684,10 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) * This is the last action before returning to the low level ASM code which * just needs to return to the appropriate context. * - * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry - * function must be fed into the @rcu_exit argument. + * Counterpart to idtentry_enter(). The return value of the entry + * function must be fed into the @state argument. */ -void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) +noinstr void idtentry_exit(struct pt_regs *regs, idtentry_state_t state) { lockdep_assert_irqs_disabled(); @@ -695,7 +700,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) * carefully and needs the same ordering of lockdep/tracing * and RCU as the return to user mode path. */ - if (rcu_exit) { + if (state.exit_rcu) { instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); @@ -714,7 +719,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) * IRQ flags state is correct already. Just tell RCU if it * was not watching on entry. */ - if (rcu_exit) + if (state.exit_rcu) rcu_irq_exit(); } } @@ -726,7 +731,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) * Invokes enter_from_user_mode() to establish the proper context for * NOHZ_FULL. Otherwise scheduling on exit would not be possible. */ -void noinstr idtentry_enter_user(struct pt_regs *regs) +noinstr void idtentry_enter_user(struct pt_regs *regs) { check_user_regs(regs); enter_from_user_mode(); @@ -744,13 +749,47 @@ void noinstr idtentry_enter_user(struct pt_regs *regs) * * Counterpart to idtentry_enter_user(). 
*/ -void noinstr idtentry_exit_user(struct pt_regs *regs) +noinstr void idtentry_exit_user(struct pt_regs *regs) { lockdep_assert_irqs_disabled(); prepare_exit_to_usermode(regs); } +noinstr bool idtentry_enter_nmi(struct pt_regs *regs) +{ + bool irq_state = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* @@ -800,9 +839,10 @@ static void __xen_pv_evtchn_do_upcall(void) __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs; - bool inhcall, rcu_exit; + bool inhcall; + idtentry_state_t state; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); old_regs = set_irq_regs(regs); instrumentation_begin(); @@ -812,13 +852,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) set_irq_regs(old_regs); inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(rcu_exit)) { + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { instrumentation_begin(); idtentry_exit_cond_resched(regs, true); instrumentation_end(); restore_inhcall(inhcall); } else { - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } } #endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index bf35e476a776..b6cac6e9bb70 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -14,8 +14,6 @@ * resource counting etc.. 
*/ -#define ATOMIC_INIT(i) { (i) } - /** * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 80d3b30d3ee3..d74128c964f8 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,8 +13,15 @@ void idtentry_enter_user(struct pt_regs *regs); void idtentry_exit_user(struct pt_regs *regs); -bool idtentry_enter_cond_rcu(struct pt_regs *regs); -void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); +typedef struct idtentry_state { + bool exit_rcu; +} idtentry_state_t; + +idtentry_state_t idtentry_enter(struct pt_regs *regs); +void idtentry_exit(struct pt_regs *regs, idtentry_state_t state); + +bool idtentry_enter_nmi(struct pt_regs *regs); +void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points @@ -54,12 +61,12 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -101,12 +108,12 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs, error_code); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ @@ -199,7 +206,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -207,7 +214,7 @@ __visible noinstr void func(struct pt_regs *regs, \ __##func (regs, (u8)error_code); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) @@ -241,7 +248,7 @@ static void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -249,7 +256,7 @@ __visible noinstr void func(struct pt_regs *regs) \ run_on_irqstack_cond(__##func, regs, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static noinline void __##func(struct pt_regs *regs) @@ -270,7 +277,7 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ __irq_enter_raw(); \ @@ -278,7 +285,7 @@ __visible noinstr void func(struct pt_regs *regs) \ __##func (regs); \ __irq_exit_raw(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + 
idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index df63786e7bfa..3f78482d9496 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); - bool rcu_exit; + idtentry_state_t state; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); /* @@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) } instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); return true; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d7c5e44b26f7..4fc9954a9560 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); @@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); out: - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); } @@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { + bool irq_state; + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -491,14 +490,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7cb3e0716f7..8493f55e1167 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_invalid_op) { - bool rcu_exit; + idtentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such @@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) @@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - nmi_enter(); + idtentry_enter_nmi(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -651,15 +651,12 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); idtentry_exit_user(regs); } else { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } } @@ -867,9 +864,8 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { - nmi_enter(); + bool 
irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); /* * If something gets miswired and we end up here for a user mode @@ -886,10 +882,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, handle_debug(regs, dr6, false); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void exc_debug_user(struct pt_regs *regs, @@ -905,6 +899,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, instrumentation_begin(); handle_debug(regs, dr6, true); + instrumentation_end(); idtentry_exit_user(regs); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1ead568c0101..5e41949453cc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1377,7 +1377,7 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { unsigned long address = read_cr2(); - bool rcu_exit; + idtentry_state_t state; prefetchw(¤t->mm->mmap_lock); @@ -1412,11 +1412,11 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * code reenabled RCU to avoid subsequent wreckage which helps * debugability. */ - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 3e7c6134ed32..744c2f463845 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -19,8 +19,6 @@ #include <asm/cmpxchg.h> #include <asm/barrier.h> -#define ATOMIC_INIT(i) { (i) } - /* * This Xtensa implementation assumes that the right mechanism * for exclusion is for locking interrupts to level EXCM_LEVEL. 
diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8ac4aad66ebc..8e940c27c27c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -406,7 +406,7 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; - seqcount_t period_seqcount; + seqcount_spinlock_t period_seqcount; u32 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ @@ -873,7 +873,6 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { - lockdep_assert_held(&ioc->lock); WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); @@ -2001,7 +2000,7 @@ static int blk_iocost_init(struct request_queue *q) ioc->running = IOC_IDLE; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); - seqcount_init(&ioc->period_seqcount); + seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(ktime_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index b45f8514dc82..a7631352a486 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -51,12 +51,6 @@ DEFINE_WD_CLASS(reservation_ww_class); EXPORT_SYMBOL(reservation_ww_class); -struct lock_class_key reservation_seqcount_class; -EXPORT_SYMBOL(reservation_seqcount_class); - -const char reservation_seqcount_string[] = "reservation_seqcount"; -EXPORT_SYMBOL(reservation_seqcount_string); - /** * dma_resv_list_alloc - allocate fence list * @shared_max: number of fences we need space for @@ -135,9 +129,8 @@ subsys_initcall(dma_resv_lockdep); void dma_resv_init(struct dma_resv *obj) { ww_mutex_init(&obj->lock, &reservation_ww_class); + seqcount_ww_mutex_init(&obj->seq, &obj->lock); - __seqcount_init(&obj->seq, reservation_seqcount_string, - &reservation_seqcount_class); RCU_INIT_POINTER(obj->fence, NULL); RCU_INIT_POINTER(obj->fence_excl, NULL); } @@ -267,7 +260,6 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence) fobj = dma_resv_get_list(obj); count = fobj->shared_count; - preempt_disable(); write_seqcount_begin(&obj->seq); for (i = 0; i < count; ++i) { @@ -289,7 +281,6 @@ replace: smp_store_mb(fobj->shared_count, count); write_seqcount_end(&obj->seq); - preempt_enable(); dma_fence_put(old); } EXPORT_SYMBOL(dma_resv_add_shared_fence); @@ -316,14 +307,12 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence) if (fence) dma_fence_get(fence); - preempt_disable(); write_seqcount_begin(&obj->seq); /* write_seqcount_begin provides the necessary memory barrier */ RCU_INIT_POINTER(obj->fence_excl, fence); if (old) old->shared_count = 0; write_seqcount_end(&obj->seq); - preempt_enable(); /* inplace update, no shared fences */ while (i--) @@ -401,13 +390,11 @@ retry: src_list = dma_resv_get_list(dst); old = dma_resv_get_excl(dst); - preempt_disable(); write_seqcount_begin(&dst->seq); /* write_seqcount_begin provides the necessary memory barrier */ RCU_INIT_POINTER(dst->fence_excl, new); RCU_INIT_POINTER(dst->fence, dst_list); write_seqcount_end(&dst->seq); - preempt_enable(); dma_resv_list_free(src_list); dma_fence_put(old); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index b91b5171270f..ff4b583cb96a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -258,11 +258,9 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, 
new->shared_count = k; /* Install the new fence list, seqcount provides the barriers */ - preempt_disable(); write_seqcount_begin(&resv->seq); RCU_INIT_POINTER(resv->fence, new); write_seqcount_end(&resv->seq); - preempt_enable(); /* Drop the references to the removed fences or move them to ef_list */ for (i = j, k = 0; i < old->shared_count; ++i) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab8067f9ce8c..892aefe88fa7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6935,7 +6935,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) } else goto abort; spin_lock_init(&conf->device_lock); - seqcount_init(&conf->gen_lock); + seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f90e0704bed9..a2c9e9e9f5ac 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -589,7 +589,7 @@ struct r5conf { int prev_chunk_sectors; int prev_algo; short generation; /* increments with every reshape */ - seqcount_t gen_lock; /* lock against generation changes */ + seqcount_spinlock_t gen_lock; /* lock against generation changes */ unsigned long reshape_checkpoint; /* Time we last updated * metadata */ long long min_offset_diff; /* minimum difference between diff --git a/fs/dcache.c b/fs/dcache.c index 361ea7ab30ea..ea0485861d93 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1746,7 +1746,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); - seqcount_init(&dentry->d_seq); + seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index ca639ed967b7..04b3f5b9c629 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -117,7 +117,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) fs->users = 1; fs->in_exec = 0; spin_lock_init(&fs->lock); - seqcount_init(&fs->seq); + seqcount_spinlock_init(&fs->seq, &fs->lock); fs->umask = old->umask; spin_lock(&old->lock); @@ -163,6 +163,6 @@ EXPORT_SYMBOL(current_umask); struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), - .seq = SEQCNT_ZERO(init_fs.seq), + .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), .umask = 0022, }; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2b7f6dcd2eb8..210e590e1f71 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -117,7 +117,7 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; - seqcount_t so_reclaim_seqcount; + seqcount_spinlock_t so_reclaim_seqcount; struct mutex so_delegreturn_mutex; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a8dc25ce48bb..b1dba24918f8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -509,7 +509,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); - seqcount_init(&sp->so_reclaim_seqcount); + seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock); mutex_init(&sp->so_delegreturn_mutex); return sp; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 52de29000c7e..26e8b23594fb 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -61,7 +61,7 @@ struct userfaultfd_ctx { /* waitqueue head for events */ 
wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ - struct seqcount refile_seq; + seqcount_spinlock_t refile_seq; /* pseudo fd refcounting */ refcount_t refcount; /* userfaultfd syscall flags */ @@ -1998,7 +1998,7 @@ static void init_once_userfaultfd_ctx(void *mem) init_waitqueue_head(&ctx->fault_wqh); init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); - seqcount_init(&ctx->refile_seq); + seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } SYSCALL_DEFINE1(userfaultfd, int, flags) diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 286867f593d2..11f96f40f4a7 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -159,8 +159,6 @@ ATOMIC_OP(xor, ^) * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - /** * atomic_read - read atomic variable * @v: pointer of type atomic_t diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index fde943d180e0..2b26cd729b94 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -11,6 +11,7 @@ #define __ASM_GENERIC_QSPINLOCK_H #include <asm-generic/qspinlock_types.h> +#include <linux/atomic.h> /** * queued_spin_is_locked - is the spinlock locked? diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 56d1309d32f8..2fd1fb89ec36 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -9,15 +9,7 @@ #ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H #define __ASM_GENERIC_QSPINLOCK_TYPES_H -/* - * Including atomic.h with PARAVIRT on will cause compilation errors because - * of recursive header file incluson via paravirt_types.h. So don't include - * it if PARAVIRT is on. 
- */ -#ifndef CONFIG_PARAVIRT #include <linux/types.h> -#include <linux/atomic.h> -#endif typedef struct qspinlock { union { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a81f0c3cf352..65d975bf9390 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -89,7 +89,7 @@ extern struct dentry_stat_t dentry_stat; struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ - seqcount_t d_seq; /* per dentry seqlock */ + seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index ee50d10f052b..d44a77e8a7e3 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -46,8 +46,6 @@ #include <linux/rcupdate.h> extern struct ww_class reservation_ww_class; -extern struct lock_class_key reservation_seqcount_class; -extern const char reservation_seqcount_string[]; /** * struct dma_resv_list - a list of shared fences @@ -71,7 +69,7 @@ struct dma_resv_list { */ struct dma_resv { struct ww_mutex lock; - seqcount_t seq; + seqcount_ww_mutex_t seq; struct dma_fence __rcu *fence_excl; struct dma_resv_list __rcu *fence; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index cf1015abfbf2..783b48dedb72 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -9,7 +9,7 @@ struct fs_struct { int users; spinlock_t lock; - seqcount_t seq; + seqcount_spinlock_t seq; int umask; int in_exec; struct path root, pwd; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 03c9fece7d43..754f67ac4326 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -111,32 +111,42 @@ extern void rcu_nmi_exit(void); /* * nmi_enter() can nest up to 15 times; see NMI_BITS. 
*/ -#define nmi_enter() \ +#define __nmi_enter() \ do { \ + lockdep_off(); \ arch_nmi_enter(); \ printk_nmi_enter(); \ - lockdep_off(); \ BUG_ON(in_nmi() == NMI_MASK); \ __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ - rcu_nmi_enter(); \ + } while (0) + +#define nmi_enter() \ + do { \ + __nmi_enter(); \ lockdep_hardirq_enter(); \ + rcu_nmi_enter(); \ instrumentation_begin(); \ ftrace_nmi_enter(); \ instrumentation_end(); \ } while (0) +#define __nmi_exit() \ + do { \ + BUG_ON(!in_nmi()); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + printk_nmi_exit(); \ + arch_nmi_exit(); \ + lockdep_on(); \ + } while (0) + #define nmi_exit() \ do { \ instrumentation_begin(); \ ftrace_nmi_exit(); \ instrumentation_end(); \ - lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ - BUG_ON(!in_nmi()); \ - __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ - lockdep_on(); \ - printk_nmi_exit(); \ - arch_nmi_exit(); \ + lockdep_hardirq_exit(); \ + __nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 15c8ac313678..25993b86ac5c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -159,7 +159,7 @@ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 6384d2813ded..5811ee8a5cd8 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -14,6 +14,7 @@ #include <linux/typecheck.h> #include <asm/irqflags.h> +#include <asm/percpu.h> /* Currently lockdep_softirqs_on/off is used only by lockdep */ #ifdef CONFIG_PROVE_LOCKING @@ -31,18 +32,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS + +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, hardirq_context); + extern void trace_hardirqs_on_prepare(void); extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); -# define lockdep_hardirq_context(p) ((p)->hardirq_context) +# define lockdep_hardirq_context() (this_cpu_read(hardirq_context)) # define lockdep_softirq_context(p) ((p)->softirq_context) -# define lockdep_hardirqs_enabled(p) ((p)->hardirqs_enabled) +# define lockdep_hardirqs_enabled() (this_cpu_read(hardirqs_enabled)) # define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled) -# define lockdep_hardirq_enter() \ -do { \ - if (!current->hardirq_context++) \ - current->hardirq_threaded = 0; \ +# define lockdep_hardirq_enter() \ +do { \ + if (this_cpu_inc_return(hardirq_context) == 1) \ + current->hardirq_threaded = 0; \ } while (0) # define lockdep_hardirq_threaded() \ do { \ @@ -50,7 +55,7 @@ do { \ } while (0) # define lockdep_hardirq_exit() \ do { \ - current->hardirq_context--; \ + this_cpu_dec(hardirq_context); \ } while (0) # define lockdep_softirq_enter() \ do { \ @@ -104,9 +109,9 @@ do { \ # define trace_hardirqs_off_finish() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_threaded() do { } while (0) diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 
dc1da020305b..dac047abdba7 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -42,7 +42,7 @@ struct kvm_kernel_irqfd { wait_queue_entry_t wait; /* Update side is protected by irqfds.lock */ struct kvm_kernel_irq_routing_entry irq_entry; - seqcount_t irq_entry_sc; + seqcount_spinlock_t irq_entry_sc; /* Used for level IRQ fast-path */ int gsi; struct work_struct inject; diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8fce5c98a4b0..39a35699d0d6 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -10,33 +10,15 @@ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H +#include <linux/lockdep_types.h> +#include <asm/percpu.h> + struct task_struct; -struct lockdep_map; /* for sysctl */ extern int prove_locking; extern int lock_stat; -#define MAX_LOCKDEP_SUBCLASSES 8UL - -#include <linux/types.h> - -enum lockdep_wait_type { - LD_WAIT_INV = 0, /* not checked, catch all */ - - LD_WAIT_FREE, /* wait free, rcu etc.. */ - LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ - -#ifdef CONFIG_PROVE_RAW_LOCK_NESTING - LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ -#else - LD_WAIT_CONFIG = LD_WAIT_SPIN, -#endif - LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ - - LD_WAIT_MAX, /* must be last */ -}; - #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> @@ -44,147 +26,6 @@ enum lockdep_wait_type { #include <linux/debug_locks.h> #include <linux/stacktrace.h> -/* - * We'd rather not expose kernel/lockdep_states.h this wide, but we do need - * the total number of states... :-( - */ -#define XXX_LOCK_USAGE_STATES (1+2*4) - -/* - * NR_LOCKDEP_CACHING_CLASSES ... Number of classes - * cached in the instance of lockdep_map - * - * Currently main class (subclass == 0) and signle depth subclass - * are cached in lockdep_map. This optimization is mainly targeting - * on rq->lock. double_rq_lock() acquires this highly competitive with - * single depth. - */ -#define NR_LOCKDEP_CACHING_CLASSES 2 - -/* - * A lockdep key is associated with each lock object. For static locks we use - * the lock address itself as the key. Dynamically allocated lock objects can - * have a statically or dynamically allocated key. Dynamically allocated lock - * keys must be registered before being used and must be unregistered before - * the key memory is freed. - */ -struct lockdep_subclass_key { - char __one_byte; -} __attribute__ ((__packed__)); - -/* hash_entry is used to keep track of dynamically allocated keys. */ -struct lock_class_key { - union { - struct hlist_node hash_entry; - struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; - }; -}; - -extern struct lock_class_key __lockdep_no_validate__; - -struct lock_trace; - -#define LOCKSTAT_POINTS 4 - -/* - * The lock-class itself. The order of the structure members matters. - * reinit_class() zeroes the key member and all subsequent members. - */ -struct lock_class { - /* - * class-hash: - */ - struct hlist_node hash_entry; - - /* - * Entry in all_lock_classes when in use. Entry in free_lock_classes - * when not in use. Instances that are being freed are on one of the - * zapped_classes lists. - */ - struct list_head lock_entry; - - /* - * These fields represent a directed graph of lock dependencies, - * to every node we attach a list of "forward" and a list of - * "backward" graph nodes. 
- */ - struct list_head locks_after, locks_before; - - const struct lockdep_subclass_key *key; - unsigned int subclass; - unsigned int dep_gen_id; - - /* - * IRQ/softirq usage tracking bits: - */ - unsigned long usage_mask; - const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; - - /* - * Generation counter, when doing certain classes of graph walking, - * to ensure that we check one node only once: - */ - int name_version; - const char *name; - - short wait_type_inner; - short wait_type_outer; - -#ifdef CONFIG_LOCK_STAT - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; -#endif -} __no_randomize_layout; - -#ifdef CONFIG_LOCK_STAT -struct lock_time { - s64 min; - s64 max; - s64 total; - unsigned long nr; -}; - -enum bounce_type { - bounce_acquired_write, - bounce_acquired_read, - bounce_contended_write, - bounce_contended_read, - nr_bounce_types, - - bounce_acquired = bounce_acquired_write, - bounce_contended = bounce_contended_write, -}; - -struct lock_class_stats { - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; - struct lock_time read_waittime; - struct lock_time write_waittime; - struct lock_time read_holdtime; - struct lock_time write_holdtime; - unsigned long bounces[nr_bounce_types]; -}; - -struct lock_class_stats lock_stats(struct lock_class *class); -void clear_lock_stats(struct lock_class *class); -#endif - -/* - * Map the lock object (the lock instance) to the lock-class object. - * This is embedded into specific lock instances: - */ -struct lockdep_map { - struct lock_class_key *key; - struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; - const char *name; - short wait_type_outer; /* can be taken in this context */ - short wait_type_inner; /* presents this context */ -#ifdef CONFIG_LOCK_STAT - int cpu; - unsigned long ip; -#endif -}; - static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { @@ -440,8 +281,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -struct pin_cookie { unsigned int val; }; - #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); @@ -520,10 +359,6 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) -/* - * The class key takes no space if lockdep is disabled: - */ -struct lock_class_key { }; static inline void lockdep_register_key(struct lock_class_key *key) { @@ -533,11 +368,6 @@ static inline void lockdep_unregister_key(struct lock_class_key *key) { } -/* - * The lockdep_map takes no space if lockdep is disabled: - */ -struct lockdep_map { }; - #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) @@ -549,8 +379,6 @@ struct lockdep_map { }; #define lockdep_recursing(tsk) (0) -struct pin_cookie { }; - #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) @@ -703,38 +531,58 @@ do { \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) -#define lockdep_assert_irqs_enabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirqs_enabled, \ - "IRQs not enabled as expected\n"); \ - } while (0) +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, 
hardirq_context); -#define lockdep_assert_irqs_disabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirqs_enabled, \ - "IRQs not disabled as expected\n"); \ - } while (0) +#define lockdep_assert_irqs_enabled() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled)); \ +} while (0) -#define lockdep_assert_in_irq() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirq_context, \ - "Not in hardirq as expected\n"); \ - } while (0) +#define lockdep_assert_irqs_disabled() \ +do { \ + WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled)); \ +} while (0) + +#define lockdep_assert_in_irq() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context)); \ +} while (0) + +#define lockdep_assert_preemption_enabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() != 0 || \ + !this_cpu_read(hardirqs_enabled))); \ +} while (0) + +#define lockdep_assert_preemption_disabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() == 0 && \ + this_cpu_read(hardirqs_enabled))); \ +} while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) + # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) + +# define lockdep_assert_preemption_enabled() do { } while (0) +# define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirq_context && \ + lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h new file mode 100644 index 000000000000..bb35b449f533 --- /dev/null +++ b/include/linux/lockdep_types.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Runtime locking correctness validator + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * see Documentation/locking/lockdep-design.rst for more details. + */ +#ifndef __LINUX_LOCKDEP_TYPES_H +#define __LINUX_LOCKDEP_TYPES_H + +#include <linux/types.h> + +#define MAX_LOCKDEP_SUBCLASSES 8UL + +enum lockdep_wait_type { + LD_WAIT_INV = 0, /* not checked, catch all */ + + LD_WAIT_FREE, /* wait free, rcu etc.. */ + LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ + +#ifdef CONFIG_PROVE_RAW_LOCK_NESTING + LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ +#else + LD_WAIT_CONFIG = LD_WAIT_SPIN, +#endif + LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ + + LD_WAIT_MAX, /* must be last */ +}; + +#ifdef CONFIG_LOCKDEP + +/* + * We'd rather not expose kernel/lockdep_states.h this wide, but we do need + * the total number of states... :-( + */ +#define XXX_LOCK_USAGE_STATES (1+2*4) + +/* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. 
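The new lockdep_assert_preemption_enabled()/_disabled() assertions in the lockdep.h hunk above can be dropped into code whose correctness depends on the preemption state. A hedged sketch with a hypothetical per-CPU counter (my_counter and my_counter_add are not part of this patch):

/* assumes <linux/percpu.h> and <linux/lockdep.h>; illustration only */
DEFINE_PER_CPU(u64, my_counter);

static void my_counter_add(u64 delta)
{
	/* a per-CPU update must not be preempted and migrated mid-way */
	lockdep_assert_preemption_disabled();
	__this_cpu_add(my_counter, delta);
}

Per the macro bodies above, both checks only fire when CONFIG_PREEMPT_COUNT and lock debugging are enabled; otherwise they are no-ops.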
+ */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + +/* + * A lockdep key is associated with each lock object. For static locks we use + * the lock address itself as the key. Dynamically allocated lock objects can + * have a statically or dynamically allocated key. Dynamically allocated lock + * keys must be registered before being used and must be unregistered before + * the key memory is freed. + */ +struct lockdep_subclass_key { + char __one_byte; +} __attribute__ ((__packed__)); + +/* hash_entry is used to keep track of dynamically allocated keys. */ +struct lock_class_key { + union { + struct hlist_node hash_entry; + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; + }; +}; + +extern struct lock_class_key __lockdep_no_validate__; + +struct lock_trace; + +#define LOCKSTAT_POINTS 4 + +/* + * The lock-class itself. The order of the structure members matters. + * reinit_class() zeroes the key member and all subsequent members. + */ +struct lock_class { + /* + * class-hash: + */ + struct hlist_node hash_entry; + + /* + * Entry in all_lock_classes when in use. Entry in free_lock_classes + * when not in use. Instances that are being freed are on one of the + * zapped_classes lists. + */ + struct list_head lock_entry; + + /* + * These fields represent a directed graph of lock dependencies, + * to every node we attach a list of "forward" and a list of + * "backward" graph nodes. + */ + struct list_head locks_after, locks_before; + + const struct lockdep_subclass_key *key; + unsigned int subclass; + unsigned int dep_gen_id; + + /* + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; + + /* + * Generation counter, when doing certain classes of graph walking, + * to ensure that we check one node only once: + */ + int name_version; + const char *name; + + short wait_type_inner; + short wait_type_outer; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; +#endif +} __no_randomize_layout; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; +}; + +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + +struct lock_class_stats { + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + +/* + * Map the lock object (the lock instance) to the lock-class object. 
+ * This is embedded into specific lock instances: + */ +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; + const char *name; + short wait_type_outer; /* can be taken in this context */ + short wait_type_inner; /* presents this context */ +#ifdef CONFIG_LOCK_STAT + int cpu; + unsigned long ip; +#endif +}; + +struct pin_cookie { unsigned int val; }; + +#else /* !CONFIG_LOCKDEP */ + +/* + * The class key takes no space if lockdep is disabled: + */ +struct lock_class_key { }; + +/* + * The lockdep_map takes no space if lockdep is disabled: + */ +struct lockdep_map { }; + +struct pin_cookie { }; + +#endif /* !LOCKDEP */ + +#endif /* __LINUX_LOCKDEP_TYPES_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 7e5b2a4eb560..25e3fde85617 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -60,39 +60,39 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) } #define RWSEM_UNLOCKED_VALUE 0L -#define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) +#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ - , .dep_map = { \ + .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ - } + }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifdef CONFIG_DEBUG_RWSEMS -# define __DEBUG_RWSEM_INITIALIZER(lockname) , .magic = &lockname +# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else -# define __DEBUG_RWSEM_INITIALIZER(lockname) +# define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER -#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED +#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ - { __RWSEM_INIT_COUNT(name), \ + { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ - .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ - __DEBUG_RWSEM_INITIALIZER(name) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ + .wait_list = LIST_HEAD_INIT((name).wait_list), \ + __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 683372943093..9a9d8263962d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -986,8 +986,6 @@ struct task_struct { unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; @@ -1052,7 +1050,7 @@ struct task_struct { /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ - seqcount_t mems_allowed_seq; + seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 8b97204f35a7..a076f783aa36 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1,48 +1,31 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H + /* - * Reader/writer consistent mechanism without starving writers. 
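The rwsem.h hunk above only reorders __RWSEM_INITIALIZER()'s designated initializers (the separating commas move onto each optional fragment); the user-visible rwsem API is unchanged, roughly as in this hypothetical sketch:

static DECLARE_RWSEM(my_rwsem);		/* expands via __RWSEM_INITIALIZER(my_rwsem) */
static int my_setting;

static int my_setting_get(void)
{
	int val;

	down_read(&my_rwsem);		/* shared: concurrent readers allowed */
	val = my_setting;
	up_read(&my_rwsem);
	return val;
}

static void my_setting_set(int val)
{
	down_write(&my_rwsem);		/* exclusive */
	my_setting = val;
	up_write(&my_rwsem);
}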
This type of - * lock for data where the reader wants a consistent set of information - * and is willing to retry if the information changes. There are two types - * of readers: - * 1. Sequence readers which never block a writer but they may have to retry - * if a writer is in progress by detecting change in sequence number. - * Writers do not wait for a sequence reader. - * 2. Locking readers which will wait if a writer or another locking reader - * is in progress. A locking reader in progress will also block a writer - * from going forward. Unlike the regular rwlock, the read lock here is - * exclusive so that only one locking reader can get it. - * - * This is not as cache friendly as brlock. Also, this may not work well - * for data that contains pointers, because any writer could - * invalidate a pointer that a reader was following. - * - * Expected non-blocking reader usage: - * do { - * seq = read_seqbegin(&foo); - * ... - * } while (read_seqretry(&foo, seq)); + * seqcount_t / seqlock_t - a reader-writer consistency mechanism with + * lockless readers (read-only retry loops), and no writer starvation. * + * See Documentation/locking/seqlock.rst * - * On non-SMP the spin locks disappear but the writer still needs - * to increment the sequence variables because an interrupt routine could - * change the state of the data. - * - * Based on x86_64 vsyscall gettimeofday - * by Keith Owens and Andrea Arcangeli + * Copyrights: + * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli + * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ -#include <linux/spinlock.h> -#include <linux/preempt.h> -#include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/kcsan-checks.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/ww_mutex.h> + #include <asm/processor.h> /* - * The seqlock interface does not prescribe a precise sequence of read - * begin/retry/end. For readers, typically there is a call to + * The seqlock seqcount_t interface does not prescribe a precise sequence of + * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * @@ -50,16 +33,34 @@ * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following - * memory operations are considered atomic. Usage of seqlocks via seqlock_t - * interface is not affected. + * memory operations are considered atomic. Usage of the seqlock_t interface + * is not affected. */ #define KCSAN_SEQLOCK_REGION_MAX 1000 /* - * Version using sequence counter only. - * This can be used when code has its own mutex protecting the - * updating starting before the write_seqcountbeqin() and ending - * after the write_seqcount_end(). + * Sequence counters (seqcount_t) + * + * This is the raw counting mechanism, without any writer protection. + * + * Write side critical sections must be serialized and non-preemptible. + * + * If readers can be invoked from hardirq or softirq contexts, + * interrupts or bottom halves must also be respectively disabled before + * entering the write section. + * + * This mechanism can't be used if the protected data contains pointers, + * as the writer can invalidate a pointer that a reader is following. 
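A minimal sketch of these write side requirements, with writer serialization coming from an external spinlock; my_lock, my_seq, my_a and my_b are hypothetical names, not part of this patch:

/* assumes <linux/seqlock.h> and <linux/spinlock.h>; illustration only */
static DEFINE_SPINLOCK(my_lock);		/* external writer serialization */
static seqcount_t my_seq = SEQCNT_ZERO(my_seq);
static u64 my_a, my_b;

static void my_update(u64 a, u64 b)
{
	spin_lock(&my_lock);		/* serializes writers, non-preemptible section */
	write_seqcount_begin(&my_seq);
	my_a = a;
	my_b = b;
	write_seqcount_end(&my_seq);
	spin_unlock(&my_lock);
}

With this series, write_seqcount_begin() on a plain seqcount_t also asserts that preemption is disabled, so dropping the spin_lock() above would now trigger a lockdep warning instead of silently racing.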
+ * + * If the write serialization mechanism is one of the common kernel + * locking primitives, use a sequence counter with associated lock + * (seqcount_LOCKTYPE_t) instead. + * + * If it's desired to automatically handle the sequence counter writer + * serialization and non-preemptibility requirements, use a sequential + * lock (seqlock_t) instead. + * + * See Documentation/locking/seqlock.rst */ typedef struct seqcount { unsigned sequence; @@ -79,13 +80,18 @@ static inline void __seqcount_init(seqcount_t *s, const char *name, } #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define SEQCOUNT_DEP_MAP_INIT(lockname) \ - .dep_map = { .name = #lockname } \ -# define seqcount_init(s) \ - do { \ - static struct lock_class_key __key; \ - __seqcount_init((s), #s, &__key); \ +# define SEQCOUNT_DEP_MAP_INIT(lockname) \ + .dep_map = { .name = #lockname } + +/** + * seqcount_init() - runtime initializer for seqcount_t + * @s: Pointer to the seqcount_t instance + */ +# define seqcount_init(s) \ + do { \ + static struct lock_class_key __key; \ + __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) @@ -105,13 +111,149 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) # define seqcount_lockdep_reader_access(x) #endif -#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)} +/** + * SEQCNT_ZERO() - static initializer for seqcount_t + * @name: Name of the seqcount_t instance + */ +#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } + +/* + * Sequence counters with associated locks (seqcount_LOCKTYPE_t) + * + * A sequence counter which associates the lock used for writer + * serialization at initialization time. This enables lockdep to validate + * that the write side critical section is properly serialized. + * + * For associated locks which do not implicitly disable preemption, + * preemption protection is enforced in the write side function. + * + * Lockdep is never used in any for the raw write variants. + * + * See Documentation/locking/seqlock.rst + */ + +#ifdef CONFIG_LOCKDEP +#define __SEQ_LOCK(expr) expr +#else +#define __SEQ_LOCK(expr) +#endif + +/** + * typedef seqcount_LOCKNAME_t - sequence counter with LOCKTYPR associated + * @seqcount: The real sequence counter + * @lock: Pointer to the associated spinlock + * + * A plain sequence counter with external writer synchronization by a + * spinlock. The spinlock is associated to the sequence count in the + * static initializer or init function. This enables lockdep to validate + * that the write side critical section is properly serialized. 
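A hedged sketch of such an associated-lock counter, using the new seqcount_spinlock_t; struct my_obj and its members are hypothetical:

/* assumes <linux/seqlock.h>; illustration only */
struct my_obj {
	spinlock_t		lock;
	seqcount_spinlock_t	seq;	/* associated with ->lock at init time */
	u64			a, b;
};

static void my_obj_init(struct my_obj *obj)
{
	spin_lock_init(&obj->lock);
	seqcount_spinlock_init(&obj->seq, &obj->lock);
}

static void my_obj_update(struct my_obj *obj, u64 a, u64 b)
{
	spin_lock(&obj->lock);
	write_seqcount_begin(&obj->seq);	/* lockdep checks ->lock is held */
	obj->a = a;
	obj->b = b;
	write_seqcount_end(&obj->seq);
	spin_unlock(&obj->lock);
}

Since spinlock_t write sections are already non-preemptible, write_seqcount_begin() does not disable preemption for this variant; for the mutex and ww_mutex variants it does so automatically.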
+ */ + +/** + * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t + * @s: Pointer to the seqcount_LOCKNAME_t instance + * @lock: Pointer to the associated LOCKTYPE + */ + +/* + * SEQCOUNT_LOCKTYPE() - Instantiate seqcount_LOCKNAME_t and helpers + * @locktype: actual typename + * @lockname: name + * @preemptible: preemptibility of above locktype + * @lockmember: argument for lockdep_assert_held() + */ +#define SEQCOUNT_LOCKTYPE(locktype, lockname, preemptible, lockmember) \ +typedef struct seqcount_##lockname { \ + seqcount_t seqcount; \ + __SEQ_LOCK(locktype *lock); \ +} seqcount_##lockname##_t; \ + \ +static __always_inline void \ +seqcount_##lockname##_init(seqcount_##lockname##_t *s, locktype *lock) \ +{ \ + seqcount_init(&s->seqcount); \ + __SEQ_LOCK(s->lock = lock); \ +} \ + \ +static __always_inline seqcount_t * \ +__seqcount_##lockname##_ptr(seqcount_##lockname##_t *s) \ +{ \ + return &s->seqcount; \ +} \ + \ +static __always_inline bool \ +__seqcount_##lockname##_preemptible(seqcount_##lockname##_t *s) \ +{ \ + return preemptible; \ +} \ + \ +static __always_inline void \ +__seqcount_##lockname##_assert(seqcount_##lockname##_t *s) \ +{ \ + __SEQ_LOCK(lockdep_assert_held(lockmember)); \ +} + +/* + * __seqprop() for seqcount_t + */ + +static inline seqcount_t *__seqcount_ptr(seqcount_t *s) +{ + return s; +} + +static inline bool __seqcount_preemptible(seqcount_t *s) +{ + return false; +} + +static inline void __seqcount_assert(seqcount_t *s) +{ + lockdep_assert_preemption_disabled(); +} + +SEQCOUNT_LOCKTYPE(raw_spinlock_t, raw_spinlock, false, s->lock) +SEQCOUNT_LOCKTYPE(spinlock_t, spinlock, false, s->lock) +SEQCOUNT_LOCKTYPE(rwlock_t, rwlock, false, s->lock) +SEQCOUNT_LOCKTYPE(struct mutex, mutex, true, s->lock) +SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) + +/** + * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t + * @name: Name of the seqcount_LOCKNAME_t instance + * @lock: Pointer to the associated LOCKTYPE + */ + +#define SEQCOUNT_LOCKTYPE_ZERO(seq_name, assoc_lock) { \ + .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ + __SEQ_LOCK(.lock = (assoc_lock)) \ +} + +#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) +#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) +#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) +#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) +#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) +#define __seqprop_case(s, lockname, prop) \ + seqcount_##lockname##_t: __seqcount_##lockname##_##prop((void *)(s)) + +#define __seqprop(s, prop) _Generic(*(s), \ + seqcount_t: __seqcount_##prop((void *)(s)), \ + __seqprop_case((s), raw_spinlock, prop), \ + __seqprop_case((s), spinlock, prop), \ + __seqprop_case((s), rwlock, prop), \ + __seqprop_case((s), mutex, prop), \ + __seqprop_case((s), ww_mutex, prop)) + +#define __seqcount_ptr(s) __seqprop(s, ptr) +#define __seqcount_lock_preemptible(s) __seqprop(s, preemptible) +#define __seqcount_assert_lock_held(s) __seqprop(s, assert) + /** - * __read_seqcount_begin - begin a seq-read critical section (without barrier) - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() * barrier. 
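The __seqprop() machinery above relies on C11 _Generic() selection to route a plain seqcount_t or any seqcount_LOCKNAME_t to its per-type helper. A stripped-down, self-contained illustration of that dispatch technique (plain userspace C, not the kernel code itself):

#include <stdbool.h>
#include <stdio.h>

struct seq_plain  { unsigned sequence; };
struct seq_locked { unsigned sequence; void *lock; };

static bool plain_preemptible(struct seq_plain *s)   { (void)s; return false; }
static bool locked_preemptible(struct seq_locked *s) { (void)s; return true;  }

/* Select the helper from the static type of *s, then call it. */
#define seq_preemptible(s)					\
	_Generic(*(s),						\
		 struct seq_plain:  plain_preemptible,		\
		 struct seq_locked: locked_preemptible)(s)

int main(void)
{
	struct seq_plain  p = { 0 };
	struct seq_locked l = { 0, NULL };

	printf("plain=%d locked=%d\n", seq_preemptible(&p), seq_preemptible(&l));
	return 0;
}

The kernel's __seqprop_case() is slightly more involved because it casts the argument to (void *), letting every branch's call expression type-check against its own helper signature.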
Callers should ensure that smp_rmb() or equivalent ordering is @@ -120,8 +262,13 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * * Use carefully, only in critical code, and comment how the barrier is * provided. + * + * Return: count to be passed to read_seqcount_retry() */ -static inline unsigned __read_seqcount_begin(const seqcount_t *s) +#define __read_seqcount_begin(s) \ + __read_seqcount_t_begin(__seqcount_ptr(s)) + +static inline unsigned __read_seqcount_t_begin(const seqcount_t *s) { unsigned ret; @@ -136,80 +283,91 @@ repeat: } /** - * raw_read_seqcount - Read the raw seqcount - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * - * raw_read_seqcount opens a read critical section of the given - * seqcount without any lockdep checking and without checking or - * masking the LSB. Calling code is responsible for handling that. + * Return: count to be passed to read_seqcount_retry() */ -static inline unsigned raw_read_seqcount(const seqcount_t *s) +#define raw_read_seqcount_begin(s) \ + raw_read_seqcount_t_begin(__seqcount_ptr(s)) + +static inline unsigned raw_read_seqcount_t_begin(const seqcount_t *s) { - unsigned ret = READ_ONCE(s->sequence); + unsigned ret = __read_seqcount_t_begin(s); smp_rmb(); - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return ret; } /** - * raw_read_seqcount_begin - start seq-read critical section w/o lockdep - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * read_seqcount_begin() - begin a seqcount_t read critical section + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * - * raw_read_seqcount_begin opens a read critical section of the given - * seqcount, but without any lockdep checking. Validity of the critical - * section is tested by checking read_seqcount_retry function. + * Return: count to be passed to read_seqcount_retry() */ -static inline unsigned raw_read_seqcount_begin(const seqcount_t *s) +#define read_seqcount_begin(s) \ + read_seqcount_t_begin(__seqcount_ptr(s)) + +static inline unsigned read_seqcount_t_begin(const seqcount_t *s) { - unsigned ret = __read_seqcount_begin(s); - smp_rmb(); - return ret; + seqcount_lockdep_reader_access(s); + return raw_read_seqcount_t_begin(s); } /** - * read_seqcount_begin - begin a seq-read critical section - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_read_seqcount() - read the raw seqcount_t counter value + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * - * read_seqcount_begin opens a read critical section of the given seqcount. - * Validity of the critical section is tested by checking read_seqcount_retry - * function. + * raw_read_seqcount opens a read critical section of the given + * seqcount_t, without any lockdep checking, and without checking or + * masking the sequence counter LSB. Calling code is responsible for + * handling that. 
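A matching lockless read side for the hypothetical my_seq/my_a/my_b writer sketched earlier, following the read_seqcount_begin()/read_seqcount_retry() kernel-doc above:

static u64 my_read_sum(void)
{
	unsigned int seq;
	u64 a, b;

	do {
		seq = read_seqcount_begin(&my_seq);
		/* copy the protected data out inside the read section */
		a = my_a;
		b = my_b;
	} while (read_seqcount_retry(&my_seq, seq));

	return a + b;
}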
+ * + * Return: count to be passed to read_seqcount_retry() */ -static inline unsigned read_seqcount_begin(const seqcount_t *s) +#define raw_read_seqcount(s) \ + raw_read_seqcount_t(__seqcount_ptr(s)) + +static inline unsigned raw_read_seqcount_t(const seqcount_t *s) { - seqcount_lockdep_reader_access(s); - return raw_read_seqcount_begin(s); + unsigned ret = READ_ONCE(s->sequence); + smp_rmb(); + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); + return ret; } /** - * raw_seqcount_begin - begin a seq-read critical section - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_seqcount_begin() - begin a seqcount_t read critical section w/o + * lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * - * raw_seqcount_begin opens a read critical section of the given seqcount. - * Validity of the critical section is tested by checking read_seqcount_retry - * function. + * raw_seqcount_begin opens a read critical section of the given + * seqcount_t. Unlike read_seqcount_begin(), this function will not wait + * for the count to stabilize. If a writer is active when it begins, it + * will fail the read_seqcount_retry() at the end of the read critical + * section instead of stabilizing at the beginning of it. * - * Unlike read_seqcount_begin(), this function will not wait for the count - * to stabilize. If a writer is active when we begin, we will fail the - * read_seqcount_retry() instead of stabilizing at the beginning of the - * critical section. + * Use this only in special kernel hot paths where the read section is + * small and has a high probability of success through other external + * means. It will save a single branching instruction. + * + * Return: count to be passed to read_seqcount_retry() */ -static inline unsigned raw_seqcount_begin(const seqcount_t *s) +#define raw_seqcount_begin(s) \ + raw_seqcount_t_begin(__seqcount_ptr(s)) + +static inline unsigned raw_seqcount_t_begin(const seqcount_t *s) { - unsigned ret = READ_ONCE(s->sequence); - smp_rmb(); - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); - return ret & ~1; + /* + * If the counter is odd, let read_seqcount_retry() fail + * by decrementing the counter. + */ + return raw_read_seqcount_t(s) & ~1; } /** - * __read_seqcount_retry - end a seq-read critical section (without barrier) - * @s: pointer to seqcount_t - * @start: count, from read_seqcount_begin - * Returns: 1 if retry is required, else 0 + * __read_seqcount_retry() - end a seqcount_t read section w/o barrier + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is @@ -218,39 +376,70 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s) * * Use carefully, only in critical code, and comment how the barrier is * provided. 
+ * + * Return: true if a read section retry is required, else false */ -static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) +#define __read_seqcount_retry(s, start) \ + __read_seqcount_t_retry(__seqcount_ptr(s), start) + +static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** - * read_seqcount_retry - end a seq-read critical section - * @s: pointer to seqcount_t - * @start: count, from read_seqcount_begin - * Returns: 1 if retry is required, else 0 + * read_seqcount_retry() - end a seqcount_t read critical section + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @start: count, from read_seqcount_begin() * - * read_seqcount_retry closes a read critical section of the given seqcount. - * If the critical section was invalid, it must be ignored (and typically - * retried). + * read_seqcount_retry closes the read critical section of given + * seqcount_t. If the critical section was invalid, it must be ignored + * (and typically retried). + * + * Return: true if a read section retry is required, else false */ -static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) +#define read_seqcount_retry(s, start) \ + read_seqcount_t_retry(__seqcount_ptr(s), start) + +static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_retry(s, start); + return __read_seqcount_t_retry(s, start); } - - -static inline void raw_write_seqcount_begin(seqcount_t *s) +/** + * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + */ +#define raw_write_seqcount_begin(s) \ +do { \ + if (__seqcount_lock_preemptible(s)) \ + preempt_disable(); \ + \ + raw_write_seqcount_t_begin(__seqcount_ptr(s)); \ +} while (0) + +static inline void raw_write_seqcount_t_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } -static inline void raw_write_seqcount_end(seqcount_t *s) +/** + * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + */ +#define raw_write_seqcount_end(s) \ +do { \ + raw_write_seqcount_t_end(__seqcount_ptr(s)); \ + \ + if (__seqcount_lock_preemptible(s)) \ + preempt_enable(); \ +} while (0) + +static inline void raw_write_seqcount_t_end(seqcount_t *s) { smp_wmb(); s->sequence++; @@ -258,47 +447,120 @@ static inline void raw_write_seqcount_end(seqcount_t *s) } /** - * raw_write_seqcount_barrier - do a seq write barrier - * @s: pointer to seqcount_t + * write_seqcount_begin_nested() - start a seqcount_t write section with + * custom lockdep nesting level + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @subclass: lockdep nesting level + * + * See Documentation/locking/lockdep-design.rst + */ +#define write_seqcount_begin_nested(s, subclass) \ +do { \ + __seqcount_assert_lock_held(s); \ + \ + if (__seqcount_lock_preemptible(s)) \ + preempt_disable(); \ + \ + write_seqcount_t_begin_nested(__seqcount_ptr(s), subclass); \ +} while (0) + +static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) +{ + raw_write_seqcount_t_begin(s); + seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); +} + +/** + * write_seqcount_begin() - start a seqcount_t write side critical section + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants 
+ * + * write_seqcount_begin opens a write side critical section of the given + * seqcount_t. + * + * Context: seqcount_t write side critical sections must be serialized and + * non-preemptible. If readers can be invoked from hardirq or softirq + * context, interrupts or bottom halves must be respectively disabled. + */ +#define write_seqcount_begin(s) \ +do { \ + __seqcount_assert_lock_held(s); \ + \ + if (__seqcount_lock_preemptible(s)) \ + preempt_disable(); \ + \ + write_seqcount_t_begin(__seqcount_ptr(s)); \ +} while (0) + +static inline void write_seqcount_t_begin(seqcount_t *s) +{ + write_seqcount_t_begin_nested(s, 0); +} + +/** + * write_seqcount_end() - end a seqcount_t write side critical section + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * - * This can be used to provide an ordering guarantee instead of the - * usual consistency guarantee. It is one wmb cheaper, because we can - * collapse the two back-to-back wmb()s. + * The write section must've been opened with write_seqcount_begin(). + */ +#define write_seqcount_end(s) \ +do { \ + write_seqcount_t_end(__seqcount_ptr(s)); \ + \ + if (__seqcount_lock_preemptible(s)) \ + preempt_enable(); \ +} while (0) + +static inline void write_seqcount_t_end(seqcount_t *s) +{ + seqcount_release(&s->dep_map, _RET_IP_); + raw_write_seqcount_t_end(s); +} + +/** + * raw_write_seqcount_barrier() - do a seqcount_t write barrier + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * + * This can be used to provide an ordering guarantee instead of the usual + * consistency guarantee. It is one wmb cheaper, because it can collapse + * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before and after the barrier are enclosed in a seq-writer - * critical section that would ensure readers are aware of ongoing writes. 
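The __seqcount_lock_preemptible() checks in the write_seqcount_begin()/write_seqcount_end() wrappers above exist for the sleeping-lock variants; a hedged sketch with seqcount_mutex_t (all names hypothetical):

static DEFINE_MUTEX(my_mutex);
static seqcount_mutex_t my_mseq = SEQCNT_MUTEX_ZERO(my_mseq, &my_mutex);
static u64 my_val;

static void my_mutex_update(u64 val)
{
	mutex_lock(&my_mutex);			/* writer serialization, may sleep */
	write_seqcount_begin(&my_mseq);		/* implies preempt_disable() for mutexes */
	my_val = val;
	write_seqcount_end(&my_mseq);		/* preempt_enable() */
	mutex_unlock(&my_mutex);
}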
+ * critical section that would ensure readers are aware of ongoing writes:: * - * seqcount_t seq; - * bool X = true, Y = false; + * seqcount_t seq; + * bool X = true, Y = false; * - * void read(void) - * { - * bool x, y; + * void read(void) + * { + * bool x, y; * - * do { - * int s = read_seqcount_begin(&seq); + * do { + * int s = read_seqcount_begin(&seq); * - * x = X; y = Y; + * x = X; y = Y; * - * } while (read_seqcount_retry(&seq, s)); + * } while (read_seqcount_retry(&seq, s)); * - * BUG_ON(!x && !y); + * BUG_ON(!x && !y); * } * * void write(void) * { - * WRITE_ONCE(Y, true); + * WRITE_ONCE(Y, true); * - * raw_write_seqcount_barrier(seq); + * raw_write_seqcount_barrier(seq); * - * WRITE_ONCE(X, false); + * WRITE_ONCE(X, false); * } */ -static inline void raw_write_seqcount_barrier(seqcount_t *s) +#define raw_write_seqcount_barrier(s) \ + raw_write_seqcount_t_barrier(__seqcount_ptr(s)) + +static inline void raw_write_seqcount_t_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; @@ -307,7 +569,44 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) kcsan_nestable_atomic_end(); } -static inline int raw_read_seqcount_latch(seqcount_t *s) +/** + * write_seqcount_invalidate() - invalidate in-progress seqcount_t read + * side operations + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * + * After write_seqcount_invalidate, no seqcount_t read side operations + * will complete successfully and see data older than this. + */ +#define write_seqcount_invalidate(s) \ + write_seqcount_t_invalidate(__seqcount_ptr(s)) + +static inline void write_seqcount_t_invalidate(seqcount_t *s) +{ + smp_wmb(); + kcsan_nestable_atomic_begin(); + s->sequence+=2; + kcsan_nestable_atomic_end(); +} + +/** + * raw_read_seqcount_latch() - pick even/odd seqcount_t latch data copy + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * + * Use seqcount_t latching to switch between two storage places protected + * by a sequence counter. Doing so allows having interruptible, preemptible, + * seqcount_t write side critical sections. + * + * Check raw_write_seqcount_latch() for more details and a full reader and + * writer usage example. + * + * Return: sequence counter raw value. Use the lowest bit as an index for + * picking which data copy to read. The full counter value must then be + * checked with read_seqcount_retry(). + */ +#define raw_read_seqcount_latch(s) \ + raw_read_seqcount_t_latch(__seqcount_ptr(s)) + +static inline int raw_read_seqcount_t_latch(seqcount_t *s) { /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ int seq = READ_ONCE(s->sequence); /* ^^^ */ @@ -315,8 +614,8 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) } /** - * raw_write_seqcount_latch - redirect readers to even/odd copy - * @s: pointer to seqcount_t + * raw_write_seqcount_latch() - redirect readers to even/odd copy + * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never @@ -332,66 +631,73 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. 
* - * The basic form is a data structure like: + * The basic form is a data structure like:: * - * struct latch_struct { - * seqcount_t seq; - * struct data_struct data[2]; - * }; + * struct latch_struct { + * seqcount_t seq; + * struct data_struct data[2]; + * }; * * Where a modification, which is assumed to be externally serialized, does the - * following: + * following:: * - * void latch_modify(struct latch_struct *latch, ...) - * { - * smp_wmb(); <- Ensure that the last data[1] update is visible - * latch->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible + * void latch_modify(struct latch_struct *latch, ...) + * { + * smp_wmb(); // Ensure that the last data[1] update is visible + * latch->seq++; + * smp_wmb(); // Ensure that the seqcount update is visible * - * modify(latch->data[0], ...); + * modify(latch->data[0], ...); * - * smp_wmb(); <- Ensure that the data[0] update is visible - * latch->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible + * smp_wmb(); // Ensure that the data[0] update is visible + * latch->seq++; + * smp_wmb(); // Ensure that the seqcount update is visible * - * modify(latch->data[1], ...); - * } + * modify(latch->data[1], ...); + * } * - * The query will have a form like: + * The query will have a form like:: * - * struct entry *latch_query(struct latch_struct *latch, ...) - * { - * struct entry *entry; - * unsigned seq, idx; + * struct entry *latch_query(struct latch_struct *latch, ...) + * { + * struct entry *entry; + * unsigned seq, idx; * - * do { - * seq = raw_read_seqcount_latch(&latch->seq); + * do { + * seq = raw_read_seqcount_latch(&latch->seq); * - * idx = seq & 0x01; - * entry = data_query(latch->data[idx], ...); + * idx = seq & 0x01; + * entry = data_query(latch->data[idx], ...); * - * smp_rmb(); - * } while (seq != latch->seq); + * // read_seqcount_retry() includes needed smp_rmb() + * } while (read_seqcount_retry(&latch->seq, seq)); * - * return entry; - * } + * return entry; + * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * - * NOTE: The non-requirement for atomic modifications does _NOT_ include - * the publishing of new entries in the case where data is a dynamic - * data structure. + * NOTE: + * + * The non-requirement for atomic modifications does _NOT_ include + * the publishing of new entries in the case where data is a dynamic + * data structure. * - * An iteration might start in data[0] and get suspended long enough - * to miss an entire modification sequence, once it resumes it might - * observe the new entry. + * An iteration might start in data[0] and get suspended long enough + * to miss an entire modification sequence, once it resumes it might + * observe the new entry. * - * NOTE: When data is a dynamic data structure; one should use regular RCU - * patterns to manage the lifetimes of the objects within. + * NOTE: + * + * When data is a dynamic data structure; one should use regular RCU + * patterns to manage the lifetimes of the objects within. 
*/ -static inline void raw_write_seqcount_latch(seqcount_t *s) +#define raw_write_seqcount_latch(s) \ + raw_write_seqcount_t_latch(__seqcount_ptr(s)) + +static inline void raw_write_seqcount_t_latch(seqcount_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->sequence++; @@ -399,67 +705,48 @@ static inline void raw_write_seqcount_latch(seqcount_t *s) } /* - * Sequence counter only version assumes that callers are using their - * own mutexing. - */ -static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) -{ - raw_write_seqcount_begin(s); - seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); -} - -static inline void write_seqcount_begin(seqcount_t *s) -{ - write_seqcount_begin_nested(s, 0); -} - -static inline void write_seqcount_end(seqcount_t *s) -{ - seqcount_release(&s->dep_map, _RET_IP_); - raw_write_seqcount_end(s); -} - -/** - * write_seqcount_invalidate - invalidate in-progress read-side seq operations - * @s: pointer to seqcount_t + * Sequential locks (seqlock_t) + * + * Sequence counters with an embedded spinlock for writer serialization + * and non-preemptibility. * - * After write_seqcount_invalidate, no read-side seq operations will complete - * successfully and see data older than this. + * For more info, see: + * - Comments on top of seqcount_t + * - Documentation/locking/seqlock.rst */ -static inline void write_seqcount_invalidate(seqcount_t *s) -{ - smp_wmb(); - kcsan_nestable_atomic_begin(); - s->sequence+=2; - kcsan_nestable_atomic_end(); -} - typedef struct { struct seqcount seqcount; spinlock_t lock; } seqlock_t; -/* - * These macros triggered gcc-3.x compile-time problems. We think these are - * OK now. Be cautious. - */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { \ - .seqcount = SEQCNT_ZERO(lockname), \ - .lock = __SPIN_LOCK_UNLOCKED(lockname) \ +#define __SEQLOCK_UNLOCKED(lockname) \ + { \ + .seqcount = SEQCNT_ZERO(lockname), \ + .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } -#define seqlock_init(x) \ - do { \ - seqcount_init(&(x)->seqcount); \ - spin_lock_init(&(x)->lock); \ +/** + * seqlock_init() - dynamic initializer for seqlock_t + * @sl: Pointer to the seqlock_t instance + */ +#define seqlock_init(sl) \ + do { \ + seqcount_init(&(sl)->seqcount); \ + spin_lock_init(&(sl)->lock); \ } while (0) -#define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) +/** + * DEFINE_SEQLOCK() - Define a statically allocated seqlock_t + * @sl: Name of the seqlock_t instance + */ +#define DEFINE_SEQLOCK(sl) \ + seqlock_t sl = __SEQLOCK_UNLOCKED(sl) -/* - * Read side functions for starting and finalizing a read side section. +/** + * read_seqbegin() - start a seqlock_t read side critical section + * @sl: Pointer to seqlock_t + * + * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -470,6 +757,17 @@ static inline unsigned read_seqbegin(const seqlock_t *sl) return ret; } +/** + * read_seqretry() - end a seqlock_t read side section + * @sl: Pointer to seqlock_t + * @start: count, from read_seqbegin() + * + * read_seqretry closes the read side critical section of given seqlock_t. + * If the critical section was invalid, it must be ignored (and typically + * retried). 
+ * + * Return: true if a read section retry is required, else false + */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { /* @@ -481,44 +779,88 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) return read_seqcount_retry(&sl->seqcount, start); } -/* - * Lock out other writers and update the count. - * Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. +/** + * write_seqlock() - start a seqlock_t write side critical section + * @sl: Pointer to seqlock_t + * + * write_seqlock opens a write side critical section for the given + * seqlock_t. It also implicitly acquires the spinlock_t embedded inside + * that sequential lock. All seqlock_t write side sections are thus + * automatically serialized and non-preemptible. + * + * Context: if the seqlock_t read section, or other write side critical + * sections, can be invoked from hardirq or softirq contexts, use the + * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount); } +/** + * write_sequnlock() - end a seqlock_t write side critical section + * @sl: Pointer to seqlock_t + * + * write_sequnlock closes the (serialized and non-preemptible) write side + * critical section of given seqlock_t. + */ static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount); spin_unlock(&sl->lock); } +/** + * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section + * @sl: Pointer to seqlock_t + * + * _bh variant of write_seqlock(). Use only if the read side section, or + * other write side sections, can be invoked from softirq contexts. + */ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount); } +/** + * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section + * @sl: Pointer to seqlock_t + * + * write_sequnlock_bh closes the serialized, non-preemptible, and + * softirqs-disabled, seqlock_t write side critical section opened with + * write_seqlock_bh(). + */ static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount); spin_unlock_bh(&sl->lock); } +/** + * write_seqlock_irq() - start a non-interruptible seqlock_t write section + * @sl: Pointer to seqlock_t + * + * _irq variant of write_seqlock(). Use only if the read side section, or + * other write sections, can be invoked from hardirq contexts. + */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount); } +/** + * write_sequnlock_irq() - end a non-interruptible seqlock_t write section + * @sl: Pointer to seqlock_t + * + * write_sequnlock_irq closes the serialized and non-interruptible + * seqlock_t write side section opened with write_seqlock_irq(). 
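A short sketch of the seqlock_t interface documented above, where the embedded spinlock makes the write side self-contained; all names are hypothetical:

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_x, my_y;

static void my_seqlock_update(u64 x, u64 y)
{
	write_seqlock(&my_seqlock);	/* takes the embedded spinlock */
	my_x = x;
	my_y = y;
	write_sequnlock(&my_seqlock);
}

static u64 my_seqlock_read(void)
{
	unsigned int seq;
	u64 x, y;

	do {
		seq = read_seqbegin(&my_seqlock);
		x = my_x;
		y = my_y;
	} while (read_seqretry(&my_seqlock, seq));

	return x + y;
}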
+ */ static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount); spin_unlock_irq(&sl->lock); } @@ -527,79 +869,112 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount); return flags; } +/** + * write_seqlock_irqsave() - start a non-interruptible seqlock_t write + * section + * @lock: Pointer to seqlock_t + * @flags: Stack-allocated storage for saving caller's local interrupt + * state, to be passed to write_sequnlock_irqrestore(). + * + * _irqsave variant of write_seqlock(). Use it only if the read side + * section, or other write sections, can be invoked from hardirq context. + */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) +/** + * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write + * section + * @sl: Pointer to seqlock_t + * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() + * + * write_sequnlock_irqrestore closes the serialized and non-interruptible + * seqlock_t write section previously opened with write_seqlock_irqsave(). + */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount); spin_unlock_irqrestore(&sl->lock, flags); } -/* - * A locking reader exclusively locks out other writers and locking readers, - * but doesn't update the sequence number. Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. +/** + * read_seqlock_excl() - begin a seqlock_t locking reader section + * @sl: Pointer to seqlock_t + * + * read_seqlock_excl opens a seqlock_t locking reader critical section. A + * locking reader exclusively locks out *both* other writers *and* other + * locking readers, but it does not update the embedded sequence number. + * + * Locking readers act like a normal spin_lock()/spin_unlock(). + * + * Context: if the seqlock_t write section, *or other read sections*, can + * be invoked from hardirq or softirq contexts, use the _irqsave or _bh + * variant of this function instead. + * + * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } +/** + * read_sequnlock_excl() - end a seqlock_t locking reader critical section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } /** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked + * read_seqlock_excl_bh() - start a seqlock_t locking reader section with + * softirqs disabled + * @sl: Pointer to seqlock_t * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. + * _bh variant of read_seqlock_excl(). Use this variant only if the + * seqlock_t write side section, *or other read sections*, can be invoked + * from softirq contexts. 
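For contrast with the lockless reader, a hedged sketch of a locking reader for the same hypothetical my_seqlock; it never needs to retry because it excludes writers (and other locking readers) entirely:

static u64 my_seqlock_read_excl(void)
{
	u64 x, y;

	read_seqlock_excl(&my_seqlock);		/* plain spin_lock(), no sequence check */
	x = my_x;
	y = my_y;
	read_sequnlock_excl(&my_seqlock);

	return x + y;
}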
*/ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } +/** + * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking + * reader section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } +/** + * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking + * reader section + * @sl: Pointer to seqlock_t + * + * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t + * write side section, *or other read sections*, can be invoked from a + * hardirq context. + */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } +/** + * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t + * locking reader section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); @@ -613,15 +988,117 @@ static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) return flags; } +/** + * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t + * locking reader section + * @lock: Pointer to seqlock_t + * @flags: Stack-allocated storage for saving caller's local interrupt + * state, to be passed to read_sequnlock_excl_irqrestore(). + * + * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t + * write side section, *or other read sections*, can be invoked from a + * hardirq context. + */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) +/** + * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t + * locking reader section + * @sl: Pointer to seqlock_t + * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() + */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { spin_unlock_irqrestore(&sl->lock, flags); } +/** + * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader + * @lock: Pointer to seqlock_t + * @seq : Marker and return parameter. If the passed value is even, the + * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). + * If the passed value is odd, the reader will become a *locking* reader + * as in read_seqlock_excl(). In the first call to this function, the + * caller *must* initialize and pass an even value to @seq; this way, a + * lockless read can be optimistically tried first. + * + * read_seqbegin_or_lock is an API designed to optimistically try a normal + * lockless seqlock_t read section first. If an odd counter is found, the + * lockless read trial has failed, and the next read iteration transforms + * itself into a full seqlock_t locking reader. + * + * This is typically used to avoid seqlock_t lockless readers starvation + * (too much retry loops) in the case of a sharp spike in write side + * activity. + * + * Context: if the seqlock_t write section, *or other read sections*, can + * be invoked from hardirq or softirq contexts, use the _irqsave or _bh + * variant of this function instead. 
+ * + * Check Documentation/locking/seqlock.rst for template example code. + * + * Return: the encountered sequence counter value, through the @seq + * parameter, which is overloaded as a return parameter. This returned + * value must be checked with need_seqretry(). If the read section need to + * be retried, this returned value must also be passed as the @seq + * parameter of the next read_seqbegin_or_lock() iteration. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl(lock); +} + +/** + * need_seqretry() - validate seqlock_t "locking or lockless" read section + * @lock: Pointer to seqlock_t + * @seq: sequence count, from read_seqbegin_or_lock() + * + * Return: true if a read section retry is required, false otherwise + */ +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +/** + * done_seqretry() - end seqlock_t "locking or lockless" reader section + * @lock: Pointer to seqlock_t + * @seq: count, from read_seqbegin_or_lock() + * + * done_seqretry finishes the seqlock_t read side critical section started + * with read_seqbegin_or_lock() and validated by need_seqretry(). + */ +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + read_sequnlock_excl(lock); +} + +/** + * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or + * a non-interruptible locking reader + * @lock: Pointer to seqlock_t + * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). + * + * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if + * the seqlock_t write section, *or other read sections*, can be invoked + * from hardirq context. + * + * Note: Interrupts will be disabled only for "locking reader" mode. + * + * Return: + * + * 1. The saved local interrupts state in case of a locking reader, to + * be passed to done_seqretry_irqrestore(). + * + * 2. The encountered sequence counter value, returned through @seq + * overloaded as a return parameter. Check read_seqbegin_or_lock(). + */ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { @@ -635,6 +1112,18 @@ read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) return flags; } +/** + * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a + * non-interruptible locking reader section + * @lock: Pointer to seqlock_t + * @seq: Count, from read_seqbegin_or_lock_irqsave() + * @flags: Caller's saved local interrupt state in case of a locking + * reader, also from read_seqbegin_or_lock_irqsave() + * + * This is the _irqrestore variant of done_seqretry(). The read section + * must've been opened with read_seqbegin_or_lock_irqsave(), and validated + * by need_seqretry(). 
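A hedged version of the read_seqbegin_or_lock()/need_seqretry()/done_seqretry() pattern documented above, again using the hypothetical my_seqlock; existing callers escalate to a locking pass by forcing @seq odd on retry, and that is the form sketched here:

static u64 my_seqlock_read_or_lock(void)
{
	int seq = 0;			/* even: first pass is lockless */
	u64 x, y;

retry:
	read_seqbegin_or_lock(&my_seqlock, &seq);
	x = my_x;
	y = my_y;
	if (need_seqretry(&my_seqlock, seq)) {
		seq = 1;		/* odd: second pass takes the lock */
		goto retry;
	}
	done_seqretry(&my_seqlock, seq);

	return x + y;
}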
+ */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d3770b3f9d9a..f2f12d746dbd 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -56,6 +56,7 @@ #include <linux/kernel.h> #include <linux/stringify.h> #include <linux/bottom_half.h> +#include <linux/lockdep.h> #include <asm/barrier.h> #include <asm/mmiowb.h> diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 6102e6bff3ae..b981caafe8bf 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -15,7 +15,7 @@ # include <linux/spinlock_types_up.h> #endif -#include <linux/lockdep.h> +#include <linux/lockdep_types.h> typedef struct raw_spinlock { arch_spinlock_t raw_lock; diff --git a/include/linux/types.h b/include/linux/types.h index d3021c879179..a147977602b5 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -167,6 +167,8 @@ typedef struct { int counter; } atomic_t; +#define ATOMIC_INIT(i) { (i) } + #ifdef CONFIG_64BIT typedef struct { s64 counter; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 90690e37a56f..ea4e2010b246 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -286,7 +286,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; -extern seqcount_t nf_conntrack_generation; +extern seqcount_spinlock_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ diff --git a/init/init_task.c b/init/init_task.c index 15089d15010a..94fe3ba1bb60 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -154,7 +154,8 @@ struct task_struct init_task .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), #endif #ifdef CONFIG_CPUSETS - .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), + .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, + &init_task.alloc_lock), #endif #ifdef CONFIG_RT_MUTEXES .pi_waiters = RB_ROOT_CACHED, diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..fc72f09a61b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -2032,11 +2032,10 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; - seqcount_init(&p->mems_allowed_seq); + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; - p->hardirqs_enabled = 0; p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process( p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; - p->hardirq_context = 0; p->softirq_context = 0; #endif diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..4616d4ad609d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" 
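The mems_allowed_seq and nf_conntrack_generation conversions above use the new lock-associated seqcount types. A minimal sketch of the same idea, under the assumption of a spinlock-serialized writer (demo_lock, demo_seq and demo_stamp are hypothetical names):

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static seqcount_spinlock_t demo_seq = SEQCNT_SPINLOCK_ZERO(demo_seq, &demo_lock);
static u64 demo_stamp;

static void demo_write(u64 stamp)
{
	spin_lock(&demo_lock);			/* associated lock serializes writers */
	write_seqcount_begin(&demo_seq);	/* with lockdep, verifies demo_lock is held */
	demo_stamp = stamp;
	write_seqcount_end(&demo_seq);
	spin_unlock(&demo_lock);
}

static u64 demo_read(void)
{
	unsigned int seq;
	u64 stamp;

	do {
		seq = read_seqcount_begin(&demo_seq);
		stamp = demo_stamp;
	} while (read_seqcount_retry(&demo_seq, seq));

	return stamp;
}

Runtime association, as done in copy_process() above, uses seqcount_spinlock_init(&demo_seq, &demo_lock) instead of the static initializer.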
*/ #include <linux/compat.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/fs.h> -#include <linux/file.h> #include <linux/jhash.h> -#include <linux/init.h> -#include <linux/futex.h> -#include <linux/mount.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/signal.h> -#include <linux/export.h> -#include <linux/magic.h> -#include <linux/pid.h> -#include <linux/nsproxy.h> -#include <linux/ptrace.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> -#include <linux/refcount.h> #include <asm/futex.h> @@ -476,7 +459,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +483,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +609,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. 
*/ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } @@ -677,10 +660,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1611,13 +1590,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + return ret; spin_lock(&hb->lock); @@ -1640,9 +1619,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: return ret; } @@ -1709,10 +1685,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1706,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1720,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,11 +1755,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret; } @@ -1992,20 +1961,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2025,13 +1992,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,12 +2055,10 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2106,8 +2069,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. 
Wait for the exit to complete otherwise @@ -2216,12 +2177,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret ? ret : task_count; } @@ -2567,7 +2522,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2580,7 +2535,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2594,8 +2549,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? ret : locked; + return ret; } /** @@ -2692,12 +2646,11 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2706,9 +2659,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2803,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2961,13 +2910,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2927,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3060,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3217,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ @@ -3284,7 +3227,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3317,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..732623c30359 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) } if (!kcsan_interrupt_watcher) - /* Use raw to avoid lockdep recursion via IRQ flags tracing. 
*/ - raw_local_irq_save(irq_flags); + local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) - raw_local_irq_restore(irq_flags); + local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ac5f8345bae9..6b2fb1a6d8cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, goto out; /* - * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if - * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a race in utilities used by - * lockdep. + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. */ lockdep_off(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..c9ea05edce25 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task) static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3646,10 +3646,19 @@ static void __trace_hardirqs_on_caller(void) */ void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. 
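For the kcsan_report() change earlier in this hunk, the comment describes bracketing the print path with lockdep disabled; a hedged sketch of that pattern (demo_print_report and its message are hypothetical):

#include <linux/lockdep.h>
#include <linux/printk.h>

static void demo_print_report(const char *what)
{
	lockdep_off();		/* keep lockdep quiet across printing that may
				 * recurse into locks lockdep itself is tracking */
	pr_err("demo report: %s\n", what);
	lockdep_on();
}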
+ */ + if (unlikely(in_nmi())) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + return; + + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3677,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3692,10 +3701,30 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) return; - if (curr->hardirqs_enabled) { + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + return; + + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3720,8 +3749,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; + this_cpu_write(hardirqs_enabled, 1); curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. 
+ */ + if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) return; /* @@ -3745,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; + this_cpu_write(hardirqs_enabled, 0); curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); @@ -3794,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) + if (lockdep_hardirqs_enabled()) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3843,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3852,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3890,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -3983,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. 
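Since lockdep's hardirq state now lives in per-CPU variables, the query helpers used throughout this hunk no longer take a task argument. An assumed usage sketch (demo_assert_sleepable_context is hypothetical):

#include <linux/irqflags.h>
#include <linux/bug.h>

static void demo_assert_sleepable_context(void)
{
	/* previously lockdep_hardirq_context(current) / lockdep_hardirqs_enabled(curr) */
	WARN_ON_ONCE(lockdep_hardirq_context());	/* must not run in hardirq context */
	WARN_ON_ONCE(!lockdep_hardirqs_enabled());	/* interrupts are expected to be on */
}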
*/ @@ -4826,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..5e9aaa648a74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -224,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..05ecfd8a3314 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. 
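The migration_cpu_base and tk_core initializers above show the static form for raw_spinlock-backed seqcounts. A hedged sketch of the same pattern with hypothetical names; note that the associated lock must already be defined so the initializer can take its address, which is why timekeeper_lock moves above tk_core:

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_raw_lock);	/* defined first: the initializer below needs &demo_raw_lock */

static struct {
	seqcount_raw_spinlock_t	seq;
	u64			payload;
} demo_core = {
	.seq = SEQCNT_RAW_SPINLOCK_ZERO(demo_core.seq, &demo_raw_lock),
};

Per-instance counters that only exist at run time, such as the hrtimer clock bases above, use seqcount_raw_spinlock_init() instead.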
*/ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. 
If this happens, the new boot offset diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ad9210d70a1..5379931ba3b5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1117,6 +1117,7 @@ config PROVE_LOCKING select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH select DEBUG_LOCK_ALLOC + select PREEMPT_COUNT if !ARCH_NO_PREEMPT select TRACE_IRQFLAGS default n help diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f33d72c5b06e..b597b5b16ba1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -seqcount_t nf_conntrack_generation __read_mostly; +seqcount_spinlock_t nf_conntrack_generation __read_mostly; static unsigned int nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, @@ -2600,7 +2600,8 @@ int nf_conntrack_init_start(void) /* struct nf_ct_ext uses u8 to store offsets/size */ BUILD_BUG_ON(total_extension_size() > 255u); - seqcount_init(&nf_conntrack_generation); + seqcount_spinlock_init(&nf_conntrack_generation, + &nf_conntrack_locks_all_lock); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b6aad3fc46c3..4b2834fd17b2 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -18,7 +18,7 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_t count; + seqcount_rwlock_t count; struct delayed_work gc_work; }; @@ -523,7 +523,7 @@ static int nft_rbtree_init(const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); rwlock_init(&priv->lock); - seqcount_init(&priv->count); + seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 564aa6492e7c..732a940468b0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -122,7 +122,7 @@ struct xfrm_pol_inexact_bin { /* list containing '*:*' policies */ struct hlist_head hhead; - seqcount_t count; + seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; @@ -155,7 +155,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; -static __read_mostly seqcount_t xfrm_policy_hash_generation; +static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; @@ -719,7 +719,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; - seqcount_init(&bin->count); + seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, @@ -1906,7 +1906,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, - seqcount_t *count, + seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; @@ -4153,7 +4153,7 @@ void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); - 
seqcount_init(&xfrm_policy_hash_generation); + seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex); xfrm_input_init(); #ifdef CONFIG_INET_ESPINTCP diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h index 67e01bbadbfe..501262aee8ff 100644 --- a/tools/include/linux/irqflags.h +++ b/tools/include/linux/irqflags.h @@ -2,9 +2,9 @@ #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_exit() do { } while (0) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index ef7ed916ad4a..d6408bb497dc 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -303,7 +303,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); INIT_WORK(&irqfd->shutdown, irqfd_shutdown); - seqcount_init(&irqfd->irq_entry_sc); + seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock); f = fdget(args->fd); if (!f.file) {
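The nft_set_rbtree and xfrm conversions in this section use the rwlock and mutex flavours of the same family. A hedged sketch of the mutex case (demo_mutex and demo_mseq are hypothetical); with a sleeping lock as the associated lock, the write-side helpers disable preemption around the counted section themselves:

#include <linux/seqlock.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);
static seqcount_mutex_t demo_mseq;

static void demo_late_init(void)
{
	/* runtime association, as done for xfrm_policy_hash_generation above */
	seqcount_mutex_init(&demo_mseq, &demo_mutex);
}

static void demo_update(void)
{
	mutex_lock(&demo_mutex);
	write_seqcount_begin(&demo_mseq);	/* disables preemption for the mutex variant */
	/* ... update the protected data ... */
	write_seqcount_end(&demo_mseq);
	mutex_unlock(&demo_mutex);
}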