diff options
81 files changed, 993 insertions, 684 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 8a0cbf3cf2c8..aee6733391cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7282,6 +7282,8 @@ S: Maintained F: kernel/sched/ F: include/linux/sched.h F: include/uapi/linux/sched.h +F: kernel/wait.c +F: include/linux/wait.h SCORE ARCHITECTURE M: Chen Liqin <liqin.linux@gmail.com> diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index a6e85f448c1c..f01fb505ad52 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index d8dd660898b9..5943f7f9d325 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -46,3 +46,4 @@ generic-y += ucontext.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index d3db39860b9c..4e6838d4ddf6 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -33,3 +33,4 @@ generic-y += timex.h generic-y += trace_clock.h generic-y += types.h generic-y += unaligned.h +generic-y += preempt.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 79a642d199f2..519f89f5b6a3 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -50,3 +50,4 @@ generic-y += unaligned.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index fd7980743890..658001b52400 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += div64.h generic-y += emergency-restart.h generic-y += exec.h generic-y += futex.h +generic-y += preempt.h generic-y += irq_regs.h generic-y += param.h generic-y += local.h diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 127826f8a375..f2b43474b0e2 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -44,3 +44,4 @@ generic-y += ucontext.h generic-y += unaligned.h generic-y += user.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index e49f918531ad..fc0b3c356027 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -56,3 +56,4 @@ generic-y += ucontext.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index c8325455520e..b06caf649a95 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += module.h generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index c5d767028306..74742dc6a3da 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 8ada3cf0c98d..7e0e7213a481 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -6,3 +6,4 @@ generic-y += mmu.h generic-y += module.h generic-y += trace_clock.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 1da17caac23c..67c3450309b7 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -53,3 +53,4 @@ generic-y += types.h generic-y += ucontext.h generic-y += unaligned.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index a3456f34f672..f93ee087e8fe 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -3,4 +3,5 @@ generic-y += clkdev.h generic-y += exec.h generic-y += kvm_para.h generic-y += trace_clock.h +generic-y += preempt.h generic-y += vtime.h
\ No newline at end of file diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index bebdc36ebb0a..2b58c5f0bc38 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += module.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 09d77a862da3..a5d27f272a59 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -31,3 +31,4 @@ generic-y += trace_clock.h generic-y += types.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index 6ae0ccb632cb..84d0c1d6b9b3 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild @@ -52,3 +52,4 @@ generic-y += unaligned.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index 23f5118f58db..8e9c0b3b9691 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -26,6 +26,8 @@ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #define cpu_to_node(cpu) ((void)(cpu), 0) diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index d3c51a6a601d..ce0bbf8f5640 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h generic-y += syscalls.h +generic-y += preempt.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 454ddf9bb76f..1acbb8b77a71 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -11,5 +11,6 @@ generic-y += sections.h generic-y += segment.h generic-y += serial.h generic-y += trace_clock.h +generic-y += preempt.h generic-y += ucontext.h generic-y += xor.h diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index d763f11e35e2..2c12ea1668d1 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep) if (rtlx == NULL) { if( (p = vpe_get_shared(tclimit)) == NULL) { if (can_sleep) { - __wait_event_interruptible(channel_wqs[index].lx_queue, - (p = vpe_get_shared(tclimit)), ret); + ret = __wait_event_interruptible( + channel_wqs[index].lx_queue, + (p = vpe_get_shared(tclimit))); if (ret) goto out_fail; } else { @@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep) /* data available to read? */ if (chan->lx_read == chan->lx_write) { if (can_sleep) { - int ret = 0; - - __wait_event_interruptible(channel_wqs[index].lx_queue, + int ret = __wait_event_interruptible( + channel_wqs[index].lx_queue, (chan->lx_read != chan->lx_write) || - sp_stopping, ret); + sp_stopping); if (ret) return ret; @@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer, /* any space left... */ if (!rtlx_write_poll(minor)) { - int ret = 0; + int ret; if (file->f_flags & O_NONBLOCK) return -EAGAIN; - __wait_event_interruptible(channel_wqs[minor].rt_queue, - rtlx_write_poll(minor), - ret); + ret = __wait_event_interruptible(channel_wqs[minor].rt_queue, + rtlx_write_poll(minor)); if (ret) return ret; } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index e205ef598e97..12156176c7ca 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) BUG_ON(Page_dcache_dirty(page)); - inc_preempt_count(); + pagefault_disable(); idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); #ifdef CONFIG_MIPS_MT_SMTC idx += FIX_N_COLOURS * smp_processor_id() + @@ -193,8 +193,7 @@ void kunmap_coherent(void) write_c0_entryhi(old_ctx); EXIT_CRITICAL(flags); #endif - dec_preempt_count(); - preempt_check_resched(); + pagefault_enable(); } void copy_user_highpage(struct page *to, struct page *from, diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index c5d767028306..74742dc6a3da 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 195653e851da..78405625e799 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -67,3 +67,4 @@ generic-y += ucontext.h generic-y += user.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index ff4c9faed546..a603b9ebe54c 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \ div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ poll.h xor.h clkdev.h exec.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 704e6f10ae80..d8f9d2f18a23 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -2,4 +2,5 @@ generic-y += clkdev.h generic-y += rwsem.h generic-y += trace_clock.h +generic-y += preempt.h generic-y += vtime.h
\ No newline at end of file diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index f313f9cbcf44..7a5288f3479a 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index e1c7bb999b06..f3414ade77a3 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -4,3 +4,4 @@ header-y += generic-y += clkdev.h generic-y += trace_clock.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 280bea9e5e2b..231efbb68108 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -34,3 +34,4 @@ generic-y += termios.h generic-y += trace_clock.h generic-y += ucontext.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 7e4a97fbded4..bf390667657a 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,3 +16,4 @@ generic-y += serial.h generic-y += trace_clock.h generic-y += types.h generic-y += word-at-a-time.h +generic-y += preempt.h diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 664d6ad23f80..22f3bd147fa7 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -38,3 +38,4 @@ generic-y += termios.h generic-y += trace_clock.h generic-y += types.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index b30f34a79882..fdde187e6087 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h generic-y += switch_to.h clkdev.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 89d8b6c4e39a..00045cbe5c63 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -60,3 +60,4 @@ generic-y += unaligned.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 722aa3b04624..da31c8b8a92d 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -6,6 +6,7 @@ #include <asm/processor.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> +#include <asm/rmwcc.h> /* * Atomic operations that C can't guarantee us. Useful for @@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v) */ static inline int atomic_sub_and_test(int i, atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e"); } /** @@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v) */ static inline int atomic_dec_and_test(atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } /** @@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v) */ static inline int atomic_inc_and_test(atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } /** @@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v) */ static inline int atomic_add_negative(int i, atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 0e1cbfc8ee06..3f065c985aee 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) */ static inline int atomic64_sub_and_test(long i, atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e"); } /** @@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v) */ static inline int atomic64_dec_and_test(atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "decq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); } /** @@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v) */ static inline int atomic64_inc_and_test(atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "incq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); } /** @@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v) */ static inline int atomic64_add_negative(long i, atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 41639ce8fd63..6d76d0935989 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -14,6 +14,7 @@ #include <linux/compiler.h> #include <asm/alternative.h> +#include <asm/rmwcc.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 @@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "bts %2,%1\n\t" - "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c"); } /** @@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "btr %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c"); } /** @@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "btc %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c"); } static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0fa675033912..cb4c73bfeb48 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with #include <asm/dwarf2.h> +#ifdef CONFIG_X86_64 + /* * 64-bit system call stack frame layout defines and helpers, * for assembly code: @@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with .macro icebp .byte 0xf1 .endm + +#else /* CONFIG_X86_64 */ + +/* + * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These + * are different from the entry_32.S versions in not changing the segment + * registers. So only suitable for in kernel use, not when transitioning + * from or to user space. The resulting stack frame is not a standard + * pt_regs frame. The main use case is calling C code from assembler + * when all the registers need to be preserved. + */ + + .macro SAVE_ALL + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + .endm + + .macro RESTORE_ALL + popl_cfi %ebx + CFI_RESTORE ebx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %esi + CFI_RESTORE esi + popl_cfi %edi + CFI_RESTORE edi + popl_cfi %ebp + CFI_RESTORE ebp + popl_cfi %eax + CFI_RESTORE eax + .endm + +#endif /* CONFIG_X86_64 */ + diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2d89e3980cbd..5b23e605e707 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l) */ static inline int local_sub_and_test(long i, local_t *l) { - unsigned char c; - - asm volatile(_ASM_SUB "%2,%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e"); } /** @@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l) */ static inline int local_dec_and_test(local_t *l) { - unsigned char c; - - asm volatile(_ASM_DEC "%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); } /** @@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l) */ static inline int local_inc_and_test(local_t *l) { - unsigned char c; - - asm volatile(_ASM_INC "%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); } /** @@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l) */ static inline int local_add_negative(long i, local_t *l) { - unsigned char c; - - asm volatile(_ASM_ADD "%2,%0; sets %1" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h new file mode 100644 index 000000000000..8729723636fd --- /dev/null +++ b/arch/x86/include/asm/preempt.h @@ -0,0 +1,100 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include <asm/rmwcc.h> +#include <asm/percpu.h> +#include <linux/thread_info.h> + +DECLARE_PER_CPU(int, __preempt_count); + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ +static __always_inline int preempt_count(void) +{ + return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void preempt_count_set(int pc) +{ + __this_cpu_write_4(__preempt_count, pc); +} + +/* + * must be macros to avoid header recursion hell + */ +#define task_preempt_count(p) \ + (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) + +#define init_task_preempt_count(p) do { \ + task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ +} while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ +} while (0) + +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); +} + +static __always_inline void clear_preempt_need_resched(void) +{ + __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); +} + +/* + * The various preempt_count add/sub methods + */ + +static __always_inline void __preempt_count_add(int val) +{ + __this_cpu_add_4(__preempt_count, val); +} + +static __always_inline void __preempt_count_sub(int val) +{ + __this_cpu_add_4(__preempt_count, -val); +} + +static __always_inline bool __preempt_count_dec_and_test(void) +{ + GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); +} + +/* + * Returns true when we need to resched and can (barring IRQ state). + */ +static __always_inline bool should_resched(void) +{ + return unlikely(!__this_cpu_read_4(__preempt_count)); +} + +#ifdef CONFIG_PREEMPT + extern asmlinkage void ___preempt_schedule(void); +# define __preempt_schedule() asm ("call ___preempt_schedule") + extern asmlinkage void preempt_schedule(void); +# ifdef CONFIG_CONTEXT_TRACKING + extern asmlinkage void ___preempt_schedule_context(void); +# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") +# endif +#endif + +#endif /* __ASM_PREEMPT_H */ diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h new file mode 100644 index 000000000000..735f1849795f --- /dev/null +++ b/arch/x86/include/asm/rmwcc.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_RMWcc +#define _ASM_X86_RMWcc + +#ifdef CC_HAVE_ASM_GOTO + +#define __GEN_RMWcc(fullop, var, cc, ...) \ +do { \ + asm volatile goto (fullop "; j" cc " %l[cc_label]" \ + : : "m" (var), ## __VA_ARGS__ \ + : "memory" : cc_label); \ + return 0; \ +cc_label: \ + return 1; \ +} while (0) + +#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0, var, cc) + +#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ + __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val)) + +#else /* !CC_HAVE_ASM_GOTO */ + +#define __GEN_RMWcc(fullop, var, cc, ...) \ +do { \ + char c; \ + asm volatile (fullop "; set" cc " %1" \ + : "+m" (var), "=qm" (c) \ + : __VA_ARGS__ : "memory"); \ + return c != 0; \ +} while (0) + +#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0, var, cc) + +#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ + __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val)) + +#endif /* CC_HAVE_ASM_GOTO */ + +#endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 27811190cbd7..c46a46be1ec6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -28,8 +28,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, - <0 => BUG */ + int saved_preempt_count; mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -49,7 +48,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ + .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a5408b965c9d..9b0a34e2cd79 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-$(CONFIG_PREEMPT) += preempt.o + obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 28610822fb3c..9f6b9341950f 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,7 +32,6 @@ void common(void) { OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_preempt_count, thread_info, preempt_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2793d1f095a2..5223fe6dec7b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); /* @@ -1169,6 +1172,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0dcb0ceb6a2..fd1bc1b15e6d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -362,12 +362,9 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_all need_resched: - movl TI_flags(%ebp), %ecx # need_resched set ? - testb $_TIF_NEED_RESCHED, %cl - jz restore_all + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b077f4cc225a..1a2cc64abcd7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1103,10 +1103,8 @@ retint_signal: /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ ENTRY(retint_kernel) - cmpl $0,TI_preempt_count(%rcx) + cmpl $0,PER_CPU_VAR(__preempt_count) jnz retint_restore_args - bt $TIF_NEED_RESCHED,TI_flags(%rcx) - jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 0fa69127209a..05fd74f537d6 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4186755f1d7c..3fe066359ac0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) irqctx->tinfo.task = curctx->tinfo.task; irqctx->tinfo.previous_esp = current_stack_pointer; - /* Copy the preempt_count so that the [soft]irq checks work. */ - irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; - if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -131,7 +128,6 @@ void irq_ctx_init(int cpu) THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); per_cpu(hardirq_ctx, cpu) = irqctx; diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S new file mode 100644 index 000000000000..ca7f0d58a87d --- /dev/null +++ b/arch/x86/kernel/preempt.S @@ -0,0 +1,25 @@ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/calling.h> + +ENTRY(___preempt_schedule) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule + RESTORE_ALL + ret + CFI_ENDPROC + +#ifdef CONFIG_CONTEXT_TRACKING + +ENTRY(___preempt_schedule_context) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule_context + RESTORE_ALL + ret + CFI_ENDPROC + +#endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c83516be1052..3fb8d95ab8b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -391,9 +391,9 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be * called with interrupts disabled. */ - local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); - local_irq_enable(); + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); } else default_idle(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 884f98f69354..c2ec1aa6d454 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) set_iopl_mask(next->iopl); /* + * If it were not for PREEMPT_ACTIVE we could guarantee that the + * preempt_count of all tasks was equal here and this would not be + * needed. + */ + task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); + this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + + /* * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bb1dc51bab05..45ab4d6fc8a7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); + /* + * If it were not for PREEMPT_ACTIVE we could guarantee that the + * preempt_count of all tasks was equal here and this would not be + * needed. + */ + task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); + this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8c8093b146ca..729aa779ff75 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - inc_preempt_count(); + preempt_count_inc(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + preempt_count_dec(); } static int __kprobes diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b014d9414d08..040681928e9d 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); #endif + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 1b982641ec35..228d6aee3a16 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -28,3 +28,4 @@ generic-y += termios.h generic-y += topology.h generic-y += trace_clock.h generic-y += xor.h +generic-y += preempt.h diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f98dd00b51a9..c7414a545a4f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = { */ static void acpi_safe_halt(void) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (!need_resched()) { + if (!tif_need_resched()) { safe_halt(); local_irq_disable(); } - current_thread_info()->status |= TS_POLLING; } #ifdef ARCH_APICTIMER_STOPS_ON_C3 @@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) + return -EINVAL; + } + lapic_timer_state_broadcast(pr, cx, 1); acpi_idle_do_entry(cx); @@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; - if (cx->entry_method != ACPI_CSTATE_FFH) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); - - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) return -EINVAL; - } } /* @@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(0); - if (cx->entry_method != ACPI_CSTATE_FFH) - current_thread_info()->status |= TS_POLLING; - lapic_timer_state_broadcast(pr, cx, 0); return index; } @@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, } } - if (cx->entry_method != ACPI_CSTATE_FFH) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); - - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) return -EINVAL; - } } acpi_unlazy_tlb(smp_processor_id()); @@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(0); - if (cx->entry_method != ACPI_CSTATE_FFH) - current_thread_info()->status |= TS_POLLING; - lapic_timer_state_broadcast(pr, cx, 0); return index; } diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fa6964d8681a..f116d664b473 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev, if (!(lapic_timer_reliable_states & (1 << (cstate)))) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); - if (!need_resched()) { + if (!current_set_polling_and_test()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h new file mode 100644 index 000000000000..ddf2b420ac8f --- /dev/null +++ b/include/asm-generic/preempt.h @@ -0,0 +1,105 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include <linux/thread_info.h> + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ +static __always_inline int preempt_count(void) +{ + return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; +} + +static __always_inline int *preempt_count_ptr(void) +{ + return ¤t_thread_info()->preempt_count; +} + +/* + * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the + * alternative is loosing a reschedule. Better schedule too often -- also this + * should be a very rare operation. + */ +static __always_inline void preempt_count_set(int pc) +{ + *preempt_count_ptr() = pc; +} + +/* + * must be macros to avoid header recursion hell + */ +#define task_preempt_count(p) \ + (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) + +#define init_task_preempt_count(p) do { \ + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ +} while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ +} while (0) + +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void clear_preempt_need_resched(void) +{ + *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); +} + +/* + * The various preempt_count add/sub methods + */ + +static __always_inline void __preempt_count_add(int val) +{ + *preempt_count_ptr() += val; +} + +static __always_inline void __preempt_count_sub(int val) +{ + *preempt_count_ptr() -= val; +} + +static __always_inline bool __preempt_count_dec_and_test(void) +{ + return !--*preempt_count_ptr(); +} + +/* + * Returns true when we need to resched and can (barring IRQ state). + */ +static __always_inline bool should_resched(void) +{ + return unlikely(!*preempt_count_ptr()); +} + +#ifdef CONFIG_PREEMPT +extern asmlinkage void preempt_schedule(void); +#define __preempt_schedule() preempt_schedule() + +#ifdef CONFIG_CONTEXT_TRACKING +extern asmlinkage void preempt_schedule_context(void); +#define __preempt_schedule_context() preempt_schedule_context() +#endif +#endif /* CONFIG_PREEMPT */ + +#endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 1e041063b226..d9cf963ac832 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void); #define __irq_enter() \ do { \ account_irq_enter_time(current); \ - add_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_add(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -49,7 +49,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_irq_exit_time(current); \ - sub_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) /* @@ -62,7 +62,7 @@ extern void irq_exit(void); lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ } while (0) @@ -72,7 +72,7 @@ extern void irq_exit(void); trace_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ } while (0) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f5d4723cdb3d..a3d9dc8c2c00 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -6,106 +6,95 @@ * preempt_count (used for kernel preemption, interrupt count, etc.) */ -#include <linux/thread_info.h> #include <linux/linkage.h> #include <linux/list.h> -#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) - extern void add_preempt_count(int val); - extern void sub_preempt_count(int val); -#else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) -#endif - -#define inc_preempt_count() add_preempt_count(1) -#define dec_preempt_count() sub_preempt_count(1) - -#define preempt_count() (current_thread_info()->preempt_count) - -#ifdef CONFIG_PREEMPT - -asmlinkage void preempt_schedule(void); - -#define preempt_check_resched() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule(); \ -} while (0) - -#ifdef CONFIG_CONTEXT_TRACKING +/* + * We use the MSB mostly because its available; see <linux/preempt_mask.h> for + * the other bits -- can't include that header due to inclusion hell. + */ +#define PREEMPT_NEED_RESCHED 0x80000000 -void preempt_schedule_context(void); +#include <asm/preempt.h> -#define preempt_check_resched_context() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule_context(); \ -} while (0) +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) +extern void preempt_count_add(int val); +extern void preempt_count_sub(int val); +#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); }) #else +#define preempt_count_add(val) __preempt_count_add(val) +#define preempt_count_sub(val) __preempt_count_sub(val) +#define preempt_count_dec_and_test() __preempt_count_dec_and_test() +#endif -#define preempt_check_resched_context() preempt_check_resched() - -#endif /* CONFIG_CONTEXT_TRACKING */ - -#else /* !CONFIG_PREEMPT */ - -#define preempt_check_resched() do { } while (0) -#define preempt_check_resched_context() do { } while (0) - -#endif /* CONFIG_PREEMPT */ +#define __preempt_count_inc() __preempt_count_add(1) +#define __preempt_count_dec() __preempt_count_sub(1) +#define preempt_count_inc() preempt_count_add(1) +#define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ - inc_preempt_count(); \ + preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ - dec_preempt_count(); \ + preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifdef CONFIG_PREEMPT #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ barrier(); \ - preempt_check_resched(); \ + if (unlikely(preempt_count_dec_and_test())) \ + __preempt_schedule(); \ +} while (0) + +#define preempt_check_resched() \ +do { \ + if (should_resched()) \ + __preempt_schedule(); \ } while (0) -/* For debugging and tracer internals only! */ -#define add_preempt_count_notrace(val) \ - do { preempt_count() += (val); } while (0) -#define sub_preempt_count_notrace(val) \ - do { preempt_count() -= (val); } while (0) -#define inc_preempt_count_notrace() add_preempt_count_notrace(1) -#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) +#else +#define preempt_enable() preempt_enable_no_resched() +#define preempt_check_resched() do { } while (0) +#endif #define preempt_disable_notrace() \ do { \ - inc_preempt_count_notrace(); \ + __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ - dec_preempt_count_notrace(); \ + __preempt_count_dec(); \ } while (0) -/* preempt_check_resched is OK to trace */ +#ifdef CONFIG_PREEMPT + +#ifndef CONFIG_CONTEXT_TRACKING +#define __preempt_schedule_context() __preempt_schedule() +#endif + #define preempt_enable_notrace() \ do { \ - preempt_enable_no_resched_notrace(); \ barrier(); \ - preempt_check_resched_context(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + __preempt_schedule_context(); \ } while (0) +#else +#define preempt_enable_notrace() preempt_enable_no_resched_notrace() +#endif #else /* !CONFIG_PREEMPT_COUNT */ @@ -115,10 +104,11 @@ do { \ * that can cause faults and scheduling migrate into our preempt-protected * region. */ -#define preempt_disable() barrier() +#define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() -#define preempt_enable_no_resched() barrier() -#define preempt_enable() barrier() +#define preempt_enable_no_resched() barrier() +#define preempt_enable() barrier() +#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..2ac5285db434 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,6 +22,7 @@ struct sched_param { #include <linux/errno.h> #include <linux/nodemask.h> #include <linux/mm_types.h> +#include <linux/preempt.h> #include <asm/page.h> #include <asm/ptrace.h> @@ -427,6 +428,14 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) + +#ifdef CONFIG_PREEMPT_COUNT +#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) +#else +#define PREEMPT_DISABLED PREEMPT_ENABLED +#endif + /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). @@ -434,7 +443,7 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) +#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts @@ -811,6 +820,10 @@ struct sched_domain { u64 last_update; + /* idle_balance() stats */ + u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -2402,11 +2415,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2475,36 +2483,105 @@ static inline int tsk_is_polling(struct task_struct *p) { return task_thread_info(p)->status & TS_POLLING; } -static inline void current_set_polling(void) +static inline void __current_set_polling(void) { current_thread_info()->status |= TS_POLLING; } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); +} + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); } #elif defined(TIF_POLLING_NRFLAG) static inline int tsk_is_polling(struct task_struct *p) { return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); } -static inline void current_set_polling(void) + +static inline void __current_set_polling(void) { set_thread_flag(TIF_POLLING_NRFLAG); } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + * + * XXX: assumes set/clear bit are identical barrier wise. + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { clear_thread_flag(TIF_POLLING_NRFLAG); } + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + #else static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void current_set_polling(void) { } -static inline void current_clr_polling(void) { } +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} #endif +static __always_inline bool need_resched(void) +{ + return unlikely(tif_need_resched()); +} + /* * Thread group CPU time accounting. */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e7e04736802f..fddbe2023a5d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) -#define set_need_resched() set_thread_flag(TIF_NEED_RESCHED) -#define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED) +static inline __deprecated void set_need_resched(void) +{ + /* + * Use of this function in deprecated. + * + * As of this writing there are only a few users in the DRM tree left + * all of which are wrong and can be removed without causing too much + * grief. + * + * The DRM people are aware and are working on removing the last few + * instances. + */ +} + +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK /* diff --git a/include/linux/topology.h b/include/linux/topology.h index d3cf0d6e7712..12ae6ce997d6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -106,6 +106,8 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .smt_gain = 1178, /* 15% */ \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -135,6 +137,8 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -166,6 +170,8 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif diff --git a/include/linux/tty.h b/include/linux/tty.h index 64f864651d86..633cac77f9f9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -672,31 +672,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, #define wait_event_interruptible_tty(tty, wq, condition) \ ({ \ int __ret = 0; \ - if (!(condition)) { \ - __wait_event_interruptible_tty(tty, wq, condition, __ret); \ - } \ + if (!(condition)) \ + __ret = __wait_event_interruptible_tty(tty, wq, \ + condition); \ __ret; \ }) -#define __wait_event_interruptible_tty(tty, wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (!signal_pending(current)) { \ - tty_unlock(tty); \ +#define __wait_event_interruptible_tty(tty, wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + tty_unlock(tty); \ schedule(); \ - tty_lock(tty); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + tty_lock(tty)) #ifdef CONFIG_PROC_FS extern void proc_tty_register_driver(struct tty_driver *); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951e1855..9d8cf056e661 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -15,7 +15,7 @@ */ static inline void pagefault_disable(void) { - inc_preempt_count(); + preempt_count_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -30,11 +30,7 @@ static inline void pagefault_enable(void) * the pagefault handler again. */ barrier(); - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); + preempt_count_dec(); preempt_check_resched(); } diff --git a/include/linux/wait.h b/include/linux/wait.h index a67fc1635592..a2726c7dd244 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1,7 +1,8 @@ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H - - +/* + * Linux wait queue related types and methods + */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> @@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); struct __wait_queue { - unsigned int flags; + unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - void *private; - wait_queue_func_t func; - struct list_head task_list; + void *private; + wait_queue_func_t func; + struct list_head task_list; }; struct wait_bit_key { - void *flags; - int bit_nr; -#define WAIT_ATOMIC_T_BIT_NR -1 + void *flags; + int bit_nr; +#define WAIT_ATOMIC_T_BIT_NR -1 }; struct wait_bit_queue { - struct wait_bit_key key; - wait_queue_t wait; + struct wait_bit_key key; + wait_queue_t wait; }; struct __wait_queue_head { - spinlock_t lock; - struct list_head task_list; + spinlock_t lock; + struct list_head task_list; }; typedef struct __wait_queue_head wait_queue_head_t; @@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { - q->flags = 0; - q->private = p; - q->func = default_wake_function; + q->flags = 0; + q->private = p; + q->func = default_wake_function; } -static inline void init_waitqueue_func_entry(wait_queue_t *q, - wait_queue_func_t func) +static inline void +init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { - q->flags = 0; - q->private = NULL; - q->func = func; + q->flags = 0; + q->private = NULL; + q->func = func; } static inline int waitqueue_active(wait_queue_head_t *q) @@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) /* * Used for wake-one threads: */ -static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, - wait_queue_t *wait) +static inline void +__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(q, wait); @@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head, list_add_tail(&new->task_list, &head->task_list); } -static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, - wait_queue_t *wait) +static inline void +__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(q, wait); } -static inline void __remove_wait_queue(wait_queue_head_t *head, - wait_queue_t *old) +static inline void +__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { list_del(&old->task_list); } void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key); +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); @@ -170,27 +170,61 @@ wait_queue_head_t *bit_waitqueue(void *, int); /* * Wakeup macros to be used to report events to the targets. */ -#define wake_up_poll(x, m) \ +#define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) -#define wake_up_locked_poll(x, m) \ +#define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) -#define wake_up_interruptible_poll(x, m) \ +#define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) -#define __wait_event(wq, condition) \ -do { \ +#define ___wait_cond_timeout(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#define ___wait_signal_pending(state) \ + ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ + (state == TASK_KILLABLE && fatal_signal_pending(current))) + +#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ DEFINE_WAIT(__wait); \ + long __ret = ret; \ \ for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (exclusive) \ + prepare_to_wait_exclusive(&wq, &__wait, state); \ + else \ + prepare_to_wait(&wq, &__wait, state); \ + \ if (condition) \ break; \ - schedule(); \ + \ + if (___wait_signal_pending(state)) { \ + __ret = -ERESTARTSYS; \ + if (exclusive) { \ + abort_exclusive_wait(&wq, &__wait, \ + state, NULL); \ + goto __out; \ + } \ + break; \ + } \ + \ + cmd; \ } \ finish_wait(&wq, &__wait); \ -} while (0) +__out: __ret; \ +}) + +#define __wait_event(wq, condition) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) /** * wait_event - sleep until a condition gets true @@ -204,29 +238,17 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ -#define wait_event(wq, condition) \ +#define wait_event(wq, condition) \ do { \ - if (condition) \ + if (condition) \ break; \ __wait_event(wq, condition); \ } while (0) -#define __wait_event_timeout(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - ret = schedule_timeout(ret); \ - if (!ret) \ - break; \ - } \ - if (!ret && (condition)) \ - ret = 1; \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses @@ -248,28 +270,14 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ - if (!(condition)) \ - __wait_event_timeout(wq, condition, __ret); \ + if (!(condition)) \ + __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) -#define __wait_event_interruptible(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (!signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_interruptible(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + schedule()) /** * wait_event_interruptible - sleep until a condition gets true @@ -290,31 +298,14 @@ do { \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_interruptible(wq, condition, __ret); \ + __ret = __wait_event_interruptible(wq, condition); \ __ret; \ }) -#define __wait_event_interruptible_timeout(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (!signal_pending(current)) { \ - ret = schedule_timeout(ret); \ - if (!ret) \ - break; \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - if (!ret && (condition)) \ - ret = 1; \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_interruptible_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses @@ -338,14 +329,14 @@ do { \ ({ \ long __ret = timeout; \ if (!(condition)) \ - __wait_event_interruptible_timeout(wq, condition, __ret); \ + __ret = __wait_event_interruptible_timeout(wq, \ + condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq, condition, timeout, state) \ ({ \ int __ret = 0; \ - DEFINE_WAIT(__wait); \ struct hrtimer_sleeper __t; \ \ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ @@ -356,25 +347,15 @@ do { \ current->timer_slack_ns, \ HRTIMER_MODE_REL); \ \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, state); \ - if (condition) \ - break; \ - if (state == TASK_INTERRUPTIBLE && \ - signal_pending(current)) { \ - __ret = -ERESTARTSYS; \ - break; \ - } \ + __ret = ___wait_event(wq, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ - schedule(); \ - } \ + schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ - finish_wait(&wq, &__wait); \ __ret; \ }) @@ -428,33 +409,15 @@ do { \ __ret; \ }) -#define __wait_event_interruptible_exclusive(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait_exclusive(&wq, &__wait, \ - TASK_INTERRUPTIBLE); \ - if (condition) { \ - finish_wait(&wq, &__wait); \ - break; \ - } \ - if (!signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - abort_exclusive_wait(&wq, &__wait, \ - TASK_INTERRUPTIBLE, NULL); \ - break; \ - } \ -} while (0) +#define __wait_event_interruptible_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ + schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_interruptible_exclusive(wq, condition, __ret);\ + __ret = __wait_event_interruptible_exclusive(wq, condition);\ __ret; \ }) @@ -606,24 +569,8 @@ do { \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) - -#define __wait_event_killable(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_KILLABLE); \ - if (condition) \ - break; \ - if (!fatal_signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_killable(wq, condition) \ + ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true @@ -644,26 +591,17 @@ do { \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_killable(wq, condition, __ret); \ + __ret = __wait_event_killable(wq, condition); \ __ret; \ }) #define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The @@ -723,26 +661,12 @@ do { \ } while (0) -#define __wait_event_interruptible_lock_irq(wq, condition, \ - lock, ret, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. @@ -772,10 +696,9 @@ do { \ #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ ({ \ int __ret = 0; \ - \ if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, cmd); \ + __ret = __wait_event_interruptible_lock_irq(wq, \ + condition, lock, cmd); \ __ret; \ }) @@ -804,39 +727,24 @@ do { \ #define wait_event_interruptible_lock_irq(wq, condition, lock) \ ({ \ int __ret = 0; \ - \ if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, ); \ + __ret = __wait_event_interruptible_lock_irq(wq, \ + condition, lock,) \ __ret; \ }) #define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ - lock, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - spin_unlock_irq(&lock); \ - ret = schedule_timeout(ret); \ - spin_lock_irq(&lock); \ - if (!ret) \ - break; \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + lock, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, ret, \ + spin_unlock_irq(&lock); \ + __ret = schedule_timeout(__ret); \ + spin_lock_irq(&lock)); /** - * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. - * The condition is checked under the lock. This is expected - * to be called with the lock taken. + * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets + * true or a timeout elapses. The condition is checked under + * the lock. This is expected to be called with the lock taken. * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() @@ -860,11 +768,10 @@ do { \ #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ timeout) \ ({ \ - int __ret = timeout; \ - \ + long __ret = timeout; \ if (!(condition)) \ - __wait_event_interruptible_lock_irq_timeout( \ - wq, condition, lock, __ret); \ + __ret = __wait_event_interruptible_lock_irq_timeout( \ + wq, condition, lock, timeout); \ __ret; \ }) @@ -875,11 +782,9 @@ do { \ * We plan to remove these interfaces. */ extern void sleep_on(wait_queue_head_t *q); -extern long sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout); extern void interruptible_sleep_on(wait_queue_head_t *q); -extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout); /* * Waitqueues which are removed from the waitqueue_head at wakeup time @@ -887,8 +792,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key); +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); @@ -934,8 +838,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); * One uses wait_on_bit() where one is waiting for the bit to clear, * but has no intention of setting it. */ -static inline int wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) +static inline int +wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_bit(bit, word)) return 0; @@ -958,8 +862,8 @@ static inline int wait_on_bit(void *word, int bit, * One uses wait_on_bit_lock() where one is waiting for the bit to * clear with the intention of setting it, and when done, clearing it. */ -static inline int wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) +static inline int +wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_and_set_bit(bit, word)) return 0; @@ -983,5 +887,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) return 0; return out_of_line_wait_on_atomic_t(val, action, mode); } - -#endif + +#endif /* _LINUX_WAIT_H */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2e7d9947a10d..613381bcde40 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) /* * For all intents and purposes a preempted task is a running task. */ - if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) + if (task_preempt_count(p) & PREEMPT_ACTIVE) state = TASK_RUNNING | TASK_STATE_MAX; #endif diff --git a/init/main.c b/init/main.c index af310afbef28..7cc4b7889a88 100644 --- a/init/main.c +++ b/init/main.c @@ -692,7 +692,7 @@ int __init_or_module do_one_initcall(initcall_t fn) if (preempt_count() != count) { sprintf(msgbuf, "preemption imbalance "); - preempt_count() = count; + preempt_count_set(count); } if (irqs_disabled()) { strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 859c8dfd78a1..e5f3917aa05b 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -void __sched notrace preempt_schedule_context(void) +asmlinkage void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb5..988573a9a387 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!need_resched()) + while (!tif_need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { - current_clr_polling(); - if (!need_resched()) { + if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); @@ -103,9 +102,16 @@ static void cpu_idle_loop(void) } else { local_irq_enable(); } - current_set_polling(); + __current_set_polling(); } arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. + */ + if (tif_need_resched()) + set_preempt_need_resched(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); @@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - current_set_polling(); + __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6..1dc9f3604ad8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp) 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + resched_cpu(smp_processor_id()); } static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..f575d5bd7e7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,12 +513,11 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -546,6 +547,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1330,12 +1326,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); - if (delta > max) + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif @@ -1396,6 +1393,14 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1717,10 +1722,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif + init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -1838,7 +1840,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); @@ -2215,7 +2217,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2224,7 +2226,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2235,9 +2237,9 @@ void __kprobes add_preempt_count(int val) if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void __kprobes preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2255,9 +2257,9 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + __preempt_count_sub(val); } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); #endif @@ -2430,6 +2432,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2520,9 +2523,9 @@ asmlinkage void __sched notrace preempt_schedule(void) return; do { - add_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2541,20 +2544,19 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); prev_state = exception_enter(); do { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3794,16 +3796,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -4212,7 +4209,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; + init_idle_preempt_count(idle, cpu); /* * The idle tasks have their own, simple scheduling class: @@ -6505,6 +6502,7 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..2b89cd244b0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 struct lb_env { struct sched_domain *sd; @@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + env->flags |= LBF_SOME_PINNED; + /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_SOME_PINNED; + env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } @@ -4447,7 +4450,7 @@ void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power; + unsigned long power, power_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -4459,7 +4462,7 @@ void update_group_power(struct sched_domain *sd, int cpu) return; } - power = 0; + power_orig = power = 0; if (child->flags & SD_OVERLAP) { /* @@ -4467,8 +4470,12 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) - power += power_of(cpu); + for_each_cpu(cpu, sched_group_cpus(sdg)) { + struct sched_group *sg = cpu_rq(cpu)->sd->groups; + + power_orig += sg->sgp->power_orig; + power += sg->sgp->power; + } } else { /* * !SD_OVERLAP domains can assume that child groups @@ -4477,12 +4484,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { + power_orig += group->sgp->power_orig; power += group->sgp->power; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = sdg->sgp->power = power; + sdg->sgp->power_orig = power_orig; + sdg->sgp->power = power; } /* @@ -4526,13 +4535,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +4548,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -struct sg_imb_stats { - unsigned long max_nr_running, min_nr_running; - unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group) { - sgi->max_cpu_load = sgi->max_nr_running = 0UL; - sgi->min_cpu_load = sgi->min_nr_running = ~0UL; + return group->sgp->imbalance; } -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, - unsigned long load, unsigned long nr_running) +/* + * Compute the group capacity. + * + * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit power unit capacity with that. + */ +static inline int sg_capacity(struct lb_env *env, struct sched_group *group) { - if (load > sgi->max_cpu_load) - sgi->max_cpu_load = load; - if (sgi->min_cpu_load > load) - sgi->min_cpu_load = load; + unsigned int capacity, smt, cpus; + unsigned int power, power_orig; - if (nr_running > sgi->max_nr_running) - sgi->max_nr_running = nr_running; - if (sgi->min_nr_running > nr_running) - sgi->min_nr_running = nr_running; -} + power = group->sgp->power; + power_orig = group->sgp->power_orig; + cpus = group->group_weight; -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && - (sgi->max_nr_running - sgi->min_nr_running) > 1) - return 1; + /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + capacity = cpus / smt; /* cores */ - return 0; + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + if (!capacity) + capacity = fix_small_capacity(env->sd, group); + + return capacity; } /** @@ -4597,12 +4592,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - struct sg_imb_stats sgi; unsigned long nr_running; unsigned long load; int i; - init_sg_imb_stats(&sgi); + memset(sgs, 0, sizeof(*sgs)); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -4610,12 +4604,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; /* Bias balancing toward cpus of our domain */ - if (local_group) { + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - update_sg_imb_stats(&sgi, load, nr_running); - } sgs->group_load += load; sgs->sum_nr_running += nr_running; @@ -4624,10 +4616,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - if (local_group && (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, group->sgp->next_update))) - update_group_power(env->sd, env->dst_cpu); - /* Adjust by relative CPU power of the group */ sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -4635,16 +4623,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(sgs, &sgi); - - sgs->group_capacity = - DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_weight = group->group_weight; + sgs->group_imb = sg_imbalanced(group); + sgs->group_capacity = sg_capacity(env, group); + if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; } @@ -4720,11 +4703,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, if (local_group) { sds->local = sg; sgs = &sds->local_stat; + + if (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, sg->sgp->next_update)) + update_group_power(env->sd, env->dst_cpu); } - memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + if (local_group) + goto next_group; + /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try @@ -4735,19 +4724,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && - sds->local && sds->local_stat.group_has_capacity) + if (prefer_sibling && sds->local && + sds->local_stat.group_has_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); - /* Now, start updating sd_lb_stats */ - sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; - - if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } +next_group: + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sgs->group_power; + sg = sg->next; } while (sg != env->sd->groups); } @@ -5164,6 +5154,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5268,17 +5259,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_SOME_PINNED; + env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); - /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5286,6 +5277,18 @@ more_balance: goto more_balance; } + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgp->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; + } + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5393,6 +5396,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + u64 curr_cost = 0; this_rq->idle_stamp = rq_clock(this_rq); @@ -5409,15 +5413,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) for_each_domain(this_cpu, sd) { unsigned long interval; int continue_balancing = 1; + u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + break; + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; } interval = msecs_to_jiffies(sd->balance_interval); @@ -5439,6 +5455,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; } /* @@ -5662,15 +5681,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize; + int need_serialize, need_decay = 0; + u64 max_cost = 0; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } + interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; @@ -5689,7 +5732,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_SOME_PINNED logic could have changed + * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ @@ -5704,14 +5747,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - + } + if (need_decay) { /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. */ - if (!continue_balancing) - break; + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..ceebfba0a1dd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1213,8 +1213,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->nr_cpus_allowed > 1)) { + curr->prio <= p->prio)) { int target = find_lowest_rq(p); if (target != -1) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..e82484db7699 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -476,6 +476,9 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -605,6 +608,7 @@ struct sched_group_power { */ unsigned int power, power_orig; unsigned long next_update; + int imbalance; /* XXX unrelated to power but shared group state */ /* * Number of busy cpus in this group. */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. */ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; - rq_sched_info_dequeued(task_rq(t), delta); + rq_sched_info_dequeued(rq, delta); } /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; - rq_sched_info_arrive(task_rq(t), delta); + rq_sched_info_arrive(rq, delta); } /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(task_rq(t)); + t->sched_info.last_queued = rq_clock(rq); } /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(task_rq(t)) - + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; - rq_sched_info_depart(task_rq(t), delta); + rq_sched_info_depart(rq, delta); if (t->state == TASK_RUNNING) - sched_info_queued(t); + sched_info_queued(rq, t); } /* @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { - struct rq *rq = task_rq(prev); - /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. */ if (prev != rq->idle) - sched_info_depart(prev); + sched_info_depart(rq, prev); if (next != rq->idle) - sched_info_arrive(next); + sched_info_arrive(rq, next); } static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + __sched_info_switch(rq, prev, next); } #else -#define sched_info_queued(t) do { } while (0) +#define sched_info_queued(rq, t) do { } while (0) #define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_dequeued(rq, t) do { } while (0) +#define sched_info_depart(rq, t) do { } while (0) +#define sched_info_arrive(rq, next) do { } while (0) +#define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..dcab1d3fb53d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) raw_local_irq_save(flags); /* - * The preempt tracer hooks into add_preempt_count and will break + * The preempt tracer hooks into preempt_count_add and will break * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET * is set and before current->softirq_enabled is cleared. * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += cnt; + __preempt_count_add(cnt); /* * Were softirqs turned off above: */ @@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(cnt); + preempt_count_add(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt) if (softirq_count() == cnt) trace_softirqs_on(_RET_IP_); - sub_preempt_count(cnt); + preempt_count_sub(cnt); } /* @@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); + preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); - dec_preempt_count(); + preempt_count_dec(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_enable(); #endif @@ -256,7 +256,7 @@ restart: " exited with %08x?\n", vec_nr, softirq_to_name[vec_nr], h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + preempt_count_set(prev_count); } rcu_bh_qs(cpu); @@ -369,7 +369,7 @@ void irq_exit(void) account_irq_exit_time(current); trace_hardirq_exit(); - sub_preempt_count(HARDIRQ_OFFSET); + preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { - int preempt_count = preempt_count(); + int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_release(&lockdep_map); - if (preempt_count != preempt_count()) { + if (count != preempt_count()) { WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); + fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ - preempt_count() = preempt_count; + preempt_count_set(count); } } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6dc09d8f4c24..872a15a2a637 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) * Some tests (e.g. double-unlock) might corrupt the preemption * count, so restore it: */ - preempt_count() = saved_preempt_count; + preempt_count_set(saved_preempt_count); #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e51d49e..04abe53f12a1 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -9,10 +9,9 @@ notrace unsigned int debug_smp_processor_id(void) { - unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); - if (likely(preempt_count)) + if (likely(preempt_count())) goto out; if (irqs_disabled()) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 0578d4fa00a9..0f676908d15b 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2563,9 +2563,8 @@ bed: jiffies + msecs_to_jiffies(val)); /* Wait for IR-LMP to call us back */ - __wait_event_interruptible(self->query_wait, - (self->cachedaddr != 0 || self->errno == -ETIME), - err); + err = __wait_event_interruptible(self->query_wait, + (self->cachedaddr != 0 || self->errno == -ETIME)); /* If watchdog is still activated, kill it! */ del_timer(&(self->watchdog)); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f4484719f3e6..f63c2388f38d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1637,12 +1637,9 @@ static int sync_thread_master(void *data) continue; } while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { - int ret = 0; - - __wait_event_interruptible(*sk_sleep(sk), + int ret = __wait_event_interruptible(*sk_sleep(sk), sock_writeable(sk) || - kthread_should_stop(), - ret); + kthread_should_stop()); if (unlikely(kthread_should_stop())) goto done; } |