diff options
author | Tejun Heo <tj@kernel.org> | 2010-12-18 15:54:36 +0100 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-12-18 15:54:36 +0100 |
commit | 05c2d088d0eb904e50460b04d77324c26cef4637 (patch) | |
tree | 1dab544e05f9021a02e76adcbdb5edf4b31c7d62 /arch | |
parent | connector: Use this_cpu operations (diff) | |
parent | cpuops: Use cmpxchg for xchg to avoid lock semantics (diff) | |
download | linux-05c2d088d0eb904e50460b04d77324c26cef4637.tar.xz linux-05c2d088d0eb904e50460b04d77324c26cef4637.zip |
Merge branch 'this_cpu_ops' into for-2.6.38
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/Kconfig.cpu | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/percpu.h | 187 |
2 files changed, 153 insertions, 37 deletions
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2ac9069890cd..15588a0ef466 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) +config CMPXCHG_LOCAL + def_bool X86_64 || (X86_32 && !M386) + config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 38f9e965ff96..8ee45167e817 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -177,39 +177,6 @@ do { \ } \ } while (0) -/* - * Add return operation - */ -#define percpu_add_return_op(var, val) \ -({ \ - typeof(var) paro_ret__ = val; \ - switch (sizeof(var)) { \ - case 1: \ - asm("xaddb %0, "__percpu_arg(1) \ - : "+q" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 2: \ - asm("xaddw %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 4: \ - asm("xaddl %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 8: \ - asm("xaddq %0, "__percpu_arg(1) \ - : "+re" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - default: __bad_percpu_size(); \ - } \ - paro_ret__ += val; \ - paro_ret__; \ -}) - #define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) pfo_ret__; \ @@ -263,6 +230,125 @@ do { \ }) /* + * Add return operation + */ +#define percpu_add_return_op(var, val) \ +({ \ + typeof(var) paro_ret__ = val; \ + switch (sizeof(var)) { \ + case 1: \ + asm("xaddb %0, "__percpu_arg(1) \ + : "+q" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 2: \ + asm("xaddw %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 4: \ + asm("xaddl %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 8: \ + asm("xaddq %0, "__percpu_arg(1) \ + : "+re" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + paro_ret__ += val; \ + paro_ret__; \ +}) + +/* + * xchg is implemented using cmpxchg without a lock prefix. xchg is + * expensive due to the implied lock prefix. The processor cannot prefetch + * cachelines if xchg is used. + */ +#define percpu_xchg_op(var, nval) \ +({ \ + typeof(var) pxo_ret__; \ + typeof(var) pxo_new__ = (nval); \ + switch (sizeof(var)) { \ + case 1: \ + asm("\n1:mov "__percpu_arg(1)",%%al" \ + "\n\tcmpxchgb %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ + : "=a" (pxo_ret__), "+m" (var) \ + : "q" (pxo_new__) \ + : "memory"); \ + break; \ + case 2: \ + asm("\n1:mov "__percpu_arg(1)",%%ax" \ + "\n\tcmpxchgw %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + case 4: \ + asm("\n1:mov "__percpu_arg(1)",%%eax" \ + "\n\tcmpxchgl %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + case 8: \ + asm("\n1:mov "__percpu_arg(1)",%%rax" \ + "\n\tcmpxchgq %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pxo_ret__; \ +}) + +/* + * cmpxchg has no such implied lock semantics as a result it is much + * more efficient for cpu local operations. + */ +#define percpu_cmpxchg_op(var, oval, nval) \ +({ \ + typeof(var) pco_ret__; \ + typeof(var) pco_old__ = (oval); \ + typeof(var) pco_new__ = (nval); \ + switch (sizeof(var)) { \ + case 1: \ + asm("cmpxchgb %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "q" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 2: \ + asm("cmpxchgw %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 4: \ + asm("cmpxchgl %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 8: \ + asm("cmpxchgq %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pco_ret__; \ +}) + +/* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. * percpu_read_stable() is more efficient and can be used if its value @@ -300,6 +386,12 @@ do { \ #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +/* + * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much + * faster than an xchg with forced lock semantics. + */ +#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -319,6 +411,11 @@ do { \ #define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) @@ -332,15 +429,32 @@ do { \ #define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#endif +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#endif /* !CONFIG_M386 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -352,6 +466,7 @@ do { \ #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) @@ -359,14 +474,12 @@ do { \ #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) - -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ |