summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib/memcpy_power7.S
diff options
context:
space:
mode:
authorSimon Guo <wei.guo.simon@gmail.com>2018-06-07 03:57:53 +0200
committerMichael Ellerman <mpe@ellerman.id.au>2018-07-24 14:03:21 +0200
commitd58badfb7cf1792ab4f1d0cd7896d733b85d650f (patch)
tree2dbfd8f16f2fd270790f5e91fa527af3b63f1731 /arch/powerpc/lib/memcpy_power7.S
parentpowerpc: add vcmpequd/vcmpequb ppc instruction macro (diff)
downloadlinux-d58badfb7cf1792ab4f1d0cd7896d733b85d650f.tar.xz
linux-d58badfb7cf1792ab4f1d0cd7896d733b85d650f.zip
powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
This patch add VMX primitives to do memcmp() in case the compare size is equal or greater than 4K bytes. KSM feature can benefit from this. Test result with following test program(replace the "^>" with ""): ------ ># cat tools/testing/selftests/powerpc/stringloops/memcmp.c >#include <malloc.h> >#include <stdlib.h> >#include <string.h> >#include <time.h> >#include "utils.h" >#define SIZE (1024 * 1024 * 900) >#define ITERATIONS 40 int test_memcmp(const void *s1, const void *s2, size_t n); static int testcase(void) { char *s1; char *s2; unsigned long i; s1 = memalign(128, SIZE); if (!s1) { perror("memalign"); exit(1); } s2 = memalign(128, SIZE); if (!s2) { perror("memalign"); exit(1); } for (i = 0; i < SIZE; i++) { s1[i] = i & 0xff; s2[i] = i & 0xff; } for (i = 0; i < ITERATIONS; i++) { int ret = test_memcmp(s1, s2, SIZE); if (ret) { printf("return %d at[%ld]! should have returned zero\n", ret, i); abort(); } } return 0; } int main(void) { return test_harness(testcase, "memcmp"); } ------ Without this patch (but with the first patch "powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()." in the series): 4.726728762 seconds time elapsed ( +- 3.54%) With VMX patch: 4.234335473 seconds time elapsed ( +- 2.63%) There is ~+10% improvement. Testing with unaligned and different offset version (make s1 and s2 shift random offset within 16 bytes) can archieve higher improvement than 10%.. Signed-off-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/lib/memcpy_power7.S')
-rw-r--r--arch/powerpc/lib/memcpy_power7.S6
1 files changed, 3 insertions, 3 deletions
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d3da08..070cdf6f584f 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi cr1,r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
.Lvmx_unaligned_copy:
/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#endif /* CONFIG_ALTIVEC */