summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Neuling <mikey@neuling.org>2015-09-25 06:01:40 +0200
committerMichael Ellerman <mpe@ellerman.id.au>2015-10-01 08:52:02 +0200
commitc974809a26a13e40254dbe3cf46f49aa32acca11 (patch)
treee06202ed23087fcce22722ee6c0026e58be4b1cb
parentpowerpc/selftest: Add gettimeofday() benchmark (diff)
downloadlinux-c974809a26a13e40254dbe3cf46f49aa32acca11.tar.xz
linux-c974809a26a13e40254dbe3cf46f49aa32acca11.zip
powerpc/vdso: Avoid link stack corruption in __get_datapage()
powerpc has a link register (lr) used for calling functions. We "bl <func>" to call a function, and "blr" to return back to the call site. The lr is only a single register, so if we call another function from inside this function (ie. nested calls), software must save away the lr on the software stack before calling the new function. Before returning (ie. before the "blr"), the lr is restored by software from the software stack. This makes branch prediction quite difficult for the processor as it will only know the branch target just before the "blr". To help with this, modern powerpc processors keep a (non-architected) hardware stack of lr called a "link stack". When a "bl <func>" is run, the lr is pushed onto this stack. When a "blr" is called, the branch predictor pops the lr value from the top of the link stack, and uses it to predict the branch target. Hence the processor pipeline knows a lot earlier the branch target. This works great but there are some cases where you call "bl" but without a matching "blr". Once such case is when trying to determine the program counter (which can't be read directly). Here you "bl+4; mflr" to get the program counter. If you do this, the link stack will get out of sync with reality, causing the branch predictor to mis-predict subsequent function returns. To avoid this, modern micro-architectures have a special case of bl. Using the form "bcl 20,31,+4", ensures the processor doesn't push to the link stack. The 32 and 64 bit variants of __get_datapage() use a "bl; mflr" to determine the loaded address of the VDSO. The current versions of these attempt to use this special bl variant. Unfortunately they use +8 rather than the required +4. Hence the current code results in the link stack getting out of sync with reality and hence the resulting performance degradation. This patch moves it to bcl+4 by moving __kernel_datapage_offset out of __get_datapage(). With this patch, running a gettimeofday() (which uses __get_datapage()) microbenchmark we get a decent bump in performance on POWER7/8. For the benchmark in tools/testing/selftests/powerpc/benchmarks/gettimeofday.c POWER8: 64bit gets ~4% improvement 32bit gets ~9% improvement POWER7: 64bit gets ~7% improvement Signed-off-by: Michael Neuling <mikey@neuling.org> Reported-by: Aaron Sawdey <sawdey@us.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--arch/powerpc/kernel/vdso32/datapage.S12
-rw-r--r--arch/powerpc/kernel/vdso64/datapage.S12
2 files changed, 14 insertions, 10 deletions
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index dc21e891d2e7..59cf5f452879 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -16,6 +16,10 @@
#include <asm/vdso.h>
.text
+ .global __kernel_datapage_offset;
+__kernel_datapage_offset:
+ .long 0
+
V_FUNCTION_BEGIN(__get_datapage)
.cfi_startproc
/* We don't want that exposed or overridable as we want other objects
@@ -27,13 +31,11 @@ V_FUNCTION_BEGIN(__get_datapage)
mflr r0
.cfi_register lr,r0
- bcl 20,31,1f
- .global __kernel_datapage_offset;
-__kernel_datapage_offset:
- .long 0
-1:
+ bcl 20,31,data_page_branch
+data_page_branch:
mflr r3
mtlr r0
+ addi r3, r3, __kernel_datapage_offset-data_page_branch
lwz r0,0(r3)
add r3,r0,r3
blr
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index 79796de11737..2f01c4a0d8a0 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -16,6 +16,10 @@
#include <asm/vdso.h>
.text
+.global __kernel_datapage_offset;
+__kernel_datapage_offset:
+ .long 0
+
V_FUNCTION_BEGIN(__get_datapage)
.cfi_startproc
/* We don't want that exposed or overridable as we want other objects
@@ -27,13 +31,11 @@ V_FUNCTION_BEGIN(__get_datapage)
mflr r0
.cfi_register lr,r0
- bcl 20,31,1f
- .global __kernel_datapage_offset;
-__kernel_datapage_offset:
- .long 0
-1:
+ bcl 20,31,data_page_branch
+data_page_branch:
mflr r3
mtlr r0
+ addi r3, r3, __kernel_datapage_offset-data_page_branch
lwz r0,0(r3)
add r3,r0,r3
blr