diff options
Diffstat (limited to 'arch/sparc/lib')
-rw-r--r-- | arch/sparc/lib/Makefile | 3 | ||||
-rw-r--r-- | arch/sparc/lib/NG2memcpy.S | 46 | ||||
-rw-r--r-- | arch/sparc/lib/NG4clear_page.S | 29 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_from_user.S | 30 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_page.S | 57 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_to_user.S | 39 | ||||
-rw-r--r-- | arch/sparc/lib/NG4memcpy.S | 360 | ||||
-rw-r--r-- | arch/sparc/lib/NG4memset.S | 105 | ||||
-rw-r--r-- | arch/sparc/lib/NG4patch.S | 54 | ||||
-rw-r--r-- | arch/sparc/lib/NGpage.S | 2 | ||||
-rw-r--r-- | arch/sparc/lib/ksyms.c | 4 |
11 files changed, 706 insertions, 23 deletions
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index dff4096f3dec..8410065f2862 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o lib-$(CONFIG_SPARC64) += NG2patch.o +lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o +lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o + lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S index 03eadf66b0d3..2c20ad63ddbf 100644 --- a/arch/sparc/lib/NG2memcpy.S +++ b/arch/sparc/lib/NG2memcpy.S @@ -14,7 +14,7 @@ #define FPRS_FEF 0x04 #ifdef MEMCPY_DEBUG #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ - clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs #else #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs @@ -182,13 +182,13 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ cmp %g2, 0 tne %xcc, 5 PREAMBLE - mov %o0, GLOBAL_SPARE + mov %o0, %o3 cmp %o2, 0 be,pn %XCC, 85f - or %o0, %o1, %o3 + or %o0, %o1, GLOBAL_SPARE cmp %o2, 16 blu,a,pn %XCC, 80f - or %o3, %o2, %o3 + or GLOBAL_SPARE, %o2, GLOBAL_SPARE /* 2 blocks (128 bytes) is the minimum we can do the block * copy with. We need to ensure that we'll iterate at least @@ -202,7 +202,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ */ cmp %o2, (4 * 64) blu,pt %XCC, 75f - andcc %o3, 0x7, %g0 + andcc GLOBAL_SPARE, 0x7, %g0 /* %o0: dst * %o1: src @@ -404,13 +404,13 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ * over. If anything is left, we copy it one byte at a time. */ brz,pt %o2, 85f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE ba,a,pt %XCC, 90f .align 64 75: /* 16 < len <= 64 */ bne,pn %XCC, 75f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE 72: andn %o2, 0xf, %o4 @@ -420,9 +420,9 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %o1, 0x08, %o1 EX_LD(LOAD(ldx, %o1, %g1)) sub %o1, 0x08, %o1 - EX_ST(STORE(stx, %o5, %o1 + %o3)) + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x8, %o1 - EX_ST(STORE(stx, %g1, %o1 + %o3)) + EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 0x8, %o1 73: andcc %o2, 0x8, %g0 @@ -430,14 +430,14 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ nop sub %o2, 0x8, %o2 EX_LD(LOAD(ldx, %o1, %o5)) - EX_ST(STORE(stx, %o5, %o1 + %o3)) + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x8, %o1 1: andcc %o2, 0x4, %g0 be,pt %XCC, 1f nop sub %o2, 0x4, %o2 EX_LD(LOAD(lduw, %o1, %o5)) - EX_ST(STORE(stw, %o5, %o1 + %o3)) + EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x4, %o1 1: cmp %o2, 0 be,pt %XCC, 85f @@ -454,11 +454,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 1: subcc %g1, 1, %g1 EX_LD(LOAD(ldub, %o1, %o5)) - EX_ST(STORE(stb, %o5, %o1 + %o3)) + EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE)) bgu,pt %icc, 1b add %o1, 1, %o1 -2: add %o1, %o3, %o0 +2: add %o1, GLOBAL_SPARE, %o0 andcc %o1, 0x7, %g1 bne,pt %icc, 8f sll %g1, 3, %g1 @@ -468,16 +468,16 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ nop ba,a,pt %xcc, 73b -8: mov 64, %o3 +8: mov 64, GLOBAL_SPARE andn %o1, 0x7, %o1 EX_LD(LOAD(ldx, %o1, %g2)) - sub %o3, %g1, %o3 + sub GLOBAL_SPARE, %g1, GLOBAL_SPARE andn %o2, 0x7, %o4 sllx %g2, %g1, %g2 1: add %o1, 0x8, %o1 EX_LD(LOAD(ldx, %o1, %g3)) subcc %o4, 0x8, %o4 - srlx %g3, %o3, %o5 + srlx %g3, GLOBAL_SPARE, %o5 or %o5, %g2, %o5 EX_ST(STORE(stx, %o5, %o0)) add %o0, 0x8, %o0 @@ -489,32 +489,32 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ be,pn %icc, 85f add %o1, %g1, %o1 ba,pt %xcc, 90f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE .align 64 80: /* 0 < len <= 16 */ - andcc %o3, 0x3, %g0 + andcc GLOBAL_SPARE, 0x3, %g0 bne,pn %XCC, 90f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE 1: subcc %o2, 4, %o2 EX_LD(LOAD(lduw, %o1, %g1)) - EX_ST(STORE(stw, %g1, %o1 + %o3)) + EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 4, %o1 85: retl - mov EX_RETVAL(GLOBAL_SPARE), %o0 + mov EX_RETVAL(%o3), %o0 .align 32 90: subcc %o2, 1, %o2 EX_LD(LOAD(ldub, %o1, %g1)) - EX_ST(STORE(stb, %g1, %o1 + %o3)) + EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 90b add %o1, 1, %o1 retl - mov EX_RETVAL(GLOBAL_SPARE), %o0 + mov EX_RETVAL(%o3), %o0 .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4clear_page.S b/arch/sparc/lib/NG4clear_page.S new file mode 100644 index 000000000000..e16c88204a42 --- /dev/null +++ b/arch/sparc/lib/NG4clear_page.S @@ -0,0 +1,29 @@ +/* NG4copy_page.S: Niagara-4 optimized clear page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + + .register %g3, #scratch + + .align 32 + .globl NG4clear_page + .globl NG4clear_user_page +NG4clear_page: /* %o0=dest */ +NG4clear_user_page: /* %o0=dest, %o1=vaddr */ + set PAGE_SIZE, %g7 + mov 0x20, %g3 +1: stxa %g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P + subcc %g7, 0x40, %g7 + stxa %g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + membar #StoreLoad|#StoreStore + retl + nop + .size NG4clear_page,.-NG4clear_page + .size NG4clear_user_page,.-NG4clear_user_page
\ No newline at end of file diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S new file mode 100644 index 000000000000..fd9f903ffa32 --- /dev/null +++ b/arch/sparc/lib/NG4copy_from_user.S @@ -0,0 +1,30 @@ +/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME NG4copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S new file mode 100644 index 000000000000..28504e88c535 --- /dev/null +++ b/arch/sparc/lib/NG4copy_page.S @@ -0,0 +1,57 @@ +/* NG4copy_page.S: Niagara-4 optimized copy page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + .align 32 + + .register %g2, #scratch + .register %g3, #scratch + + .globl NG4copy_user_page +NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ + prefetch [%o1 + 0x000], #n_reads_strong + prefetch [%o1 + 0x040], #n_reads_strong + prefetch [%o1 + 0x080], #n_reads_strong + prefetch [%o1 + 0x0c0], #n_reads_strong + set PAGE_SIZE, %g7 + prefetch [%o1 + 0x100], #n_reads_strong + prefetch [%o1 + 0x140], #n_reads_strong + prefetch [%o1 + 0x180], #n_reads_strong + prefetch [%o1 + 0x1c0], #n_reads_strong +1: + ldx [%o1 + 0x00], %o2 + subcc %g7, 0x40, %g7 + ldx [%o1 + 0x08], %o3 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + ldx [%o1 + 0x20], %g1 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x28], %g2 + stxa %o3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x30], %g3 + stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x38], %o2 + add %o1, 0x40, %o1 + stxa %o5, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g1, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + bne,pt %icc, 1b + prefetch [%o1 + 0x200], #n_reads_strong + retl + membar #StoreLoad | #StoreStore + .size NG4copy_user_page,.-NG4copy_user_page diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S new file mode 100644 index 000000000000..9744c4540a8d --- /dev/null +++ b/arch/sparc/lib/NG4copy_to_user.S @@ -0,0 +1,39 @@ +/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 +#endif + +#define FUNC_NAME NG4copy_to_user +#define STORE(type,src,addr) type##a src, [addr] %asi +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S new file mode 100644 index 000000000000..9cf2ee01cee3 --- /dev/null +++ b/arch/sparc/lib/NG4memcpy.S @@ -0,0 +1,360 @@ +/* NG4memcpy.S: Niagara-4 optimized memcpy. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE %g7 +#else +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define FPRS_FEF 0x04 + +/* On T4 it is very expensive to access ASRs like %fprs and + * %asi, avoiding a read or a write can save ~50 cycles. + */ +#define FPU_ENTER \ + rd %fprs, %o5; \ + andcc %o5, FPRS_FEF, %g0; \ + be,a,pn %icc, 999f; \ + wr %g0, FPRS_FEF, %fprs; \ + 999: + +#ifdef MEMCPY_DEBUG +#define VISEntryHalf FPU_ENTER; \ + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf FPU_ENTER +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif + +#define GLOBAL_SPARE %g5 +#endif + +#ifndef STORE_ASI +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#else +#define STORE_ASI 0x80 /* ASI_P */ +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#ifndef MEMCPY_DEBUG +#define STORE(type,src,addr) type src, [addr] +#else +#define STORE(type,src,addr) type##a src, [addr] %asi +#endif +#endif + +#ifndef STORE_INIT +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME NG4memcpy +#endif +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ +#ifdef MEMCPY_DEBUG + wr %g0, 0x80, %asi +#endif + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %XCC, 5 + PREAMBLE + mov %o0, %o3 + brz,pn %o2, .Lexit + cmp %o2, 3 + ble,pn %icc, .Ltiny + cmp %o2, 19 + ble,pn %icc, .Lsmall + or %o0, %o1, %g2 + cmp %o2, 128 + bl,pn %icc, .Lmedium + nop + +.Llarge:/* len >= 0x80 */ + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 51f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) + +51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) + LOAD(prefetch, %o1 + 0x080, #n_reads_strong) + LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x100, #n_reads_strong) + LOAD(prefetch, %o1 + 0x140, #n_reads_strong) + LOAD(prefetch, %o1 + 0x180, #n_reads_strong) + LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + /* Check if we can use the straight fully aligned + * loop, or we require the alignaddr/faligndata variant. + */ + andcc %o1, 0x7, %o5 + bne,pn %icc, .Llarge_src_unaligned + sub %g0, %o0, %g1 + + /* Legitimize the use of initializing stores by getting dest + * to be 64-byte aligned. + */ + and %g1, 0x3f, %g1 + brz,pt %g1, .Llarge_aligned + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) + add %o1, 8, %o1 + subcc %g1, 8, %g1 + add %o0, 8, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g2, %o0 - 0x08)) + +.Llarge_aligned: + /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x40, %o1 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) + EX_ST(STORE_INIT(%g1, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) + EX_ST(STORE_INIT(%o5, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + membar #StoreLoad | #StoreStore + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_noprefetch + +.Lexit: retl + mov EX_RETVAL(%o3), %o0 + +.Llarge_src_unaligned: + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + VISEntryHalf + alignaddr %o1, %g0, %g1 + add %o1, %o4, %o1 + EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) +1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) + EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) + EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) + EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) + EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) + EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) + faligndata %f2, %f4, %f18 + add %g1, 0x40, %g1 + faligndata %f4, %f6, %f20 + faligndata %f6, %f8, %f22 + faligndata %f8, %f10, %f24 + faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + EX_ST(STORE(std, %f16, %o0 + 0x00)) + EX_ST(STORE(std, %f18, %o0 + 0x08)) + EX_ST(STORE(std, %f20, %o0 + 0x10)) + EX_ST(STORE(std, %f22, %o0 + 0x18)) + EX_ST(STORE(std, %f24, %o0 + 0x20)) + EX_ST(STORE(std, %f26, %o0 + 0x28)) + EX_ST(STORE(std, %f28, %o0 + 0x30)) + EX_ST(STORE(std, %f30, %o0 + 0x38)) + add %o0, 0x40, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %g1 + 0x200, #n_reads_strong) + VISExitHalf + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_unaligned + +.Lmedium: + LOAD(prefetch, %o1 + 0x40, #n_reads_strong) + andcc %g2, 0x7, %g0 + bne,pn %icc, .Lmedium_unaligned + nop +.Lmedium_noprefetch: + andncc %o2, 0x20 - 1, %o5 + be,pn %icc, 2f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) + add %o1, 0x20, %o1 + subcc %o5, 0x20, %o5 + EX_ST(STORE(stx, %g1, %o0 + 0x00)) + EX_ST(STORE(stx, %g2, %o0 + 0x08)) + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) + EX_ST(STORE(stx, %o4, %o0 + 0x18)) + bne,pt %icc, 1b + add %o0, 0x20, %o0 +2: andcc %o2, 0x18, %o5 + be,pt %icc, 3f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x08, %o1 + add %o0, 0x08, %o0 + subcc %o5, 0x08, %o5 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g1, %o0 - 0x08)) +3: brz,pt %o2, .Lexit + cmp %o2, 0x04 + bl,pn %icc, .Ltiny + nop + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + add %o0, 0x04, %o0 + subcc %o2, 0x04, %o2 + bne,pn %icc, .Ltiny + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + ba,a,pt %icc, .Lexit +.Lmedium_unaligned: + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 2f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) +2: + and %o1, 0x7, %g1 + brz,pn %g1, .Lmedium_noprefetch + sll %g1, 3, %g1 + mov 64, %g2 + sub %g2, %g1, %g2 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) + sllx %o4, %g1, %o4 + andn %o2, 0x08 - 1, %o5 + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) + add %o1, 0x08, %o1 + subcc %o5, 0x08, %o5 + srlx %g3, %g2, GLOBAL_SPARE + or GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + sllx %g3, %g1, %o4 + srl %g1, 3, %g1 + add %o1, %g1, %o1 + brz,pn %o2, .Lexit + nop + ba,pt %icc, .Lsmall_unaligned + +.Ltiny: + EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x00)) + EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x01)) + EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) + ba,pt %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x02)) + +.Lsmall: + andcc %g2, 0x3, %g0 + bne,pn %icc, .Lsmall_unaligned + andn %o2, 0x4 - 1, %o5 + sub %o2, %o5, %o2 +1: + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + subcc %o5, 0x04, %o5 + add %o0, 0x04, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + brz,pt %o2, .Lexit + nop + ba,a,pt %icc, .Ltiny + +.Lsmall_unaligned: +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + add %o1, 1, %o1 + add %o0, 1, %o0 + subcc %o2, 1, %o2 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g1, %o0 - 0x01)) + ba,a,pt %icc, .Lexit + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S new file mode 100644 index 000000000000..41da4bdd95cb --- /dev/null +++ b/arch/sparc/lib/NG4memset.S @@ -0,0 +1,105 @@ +/* NG4memset.S: Niagara-4 optimized memset/bzero. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#include <asm/asi.h> + + .register %g2, #scratch + .register %g3, #scratch + + .text + .align 32 + .globl NG4memset +NG4memset: + andcc %o1, 0xff, %o4 + be,pt %icc, 1f + mov %o2, %o1 + sllx %o4, 8, %g1 + or %g1, %o4, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %icc, 1f + or %g1, %o2, %o4 + .size NG4memset,.-NG4memset + + .align 32 + .globl NG4bzero +NG4bzero: + clr %o4 +1: cmp %o1, 16 + ble %icc, .Ltiny + mov %o0, %o3 + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, .Laligned8 + sub %o1, %g1, %o1 +1: stb %o4, [%o0 + 0x00] + subcc %g1, 1, %g1 + bne,pt %icc, 1b + add %o0, 1, %o0 +.Laligned8: + cmp %o1, 64 + (64 - 8) + ble .Lmedium + sub %g0, %o0, %g1 + andcc %g1, (64 - 1), %g1 + brz,pn %g1, .Laligned64 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 1b + add %o0, 0x8, %o0 +.Laligned64: + andn %o1, 64 - 1, %g1 + sub %o1, %g1, %o1 + brnz,pn %o4, .Lnon_bzero_loop + mov 0x20, %g2 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x40, %o0 +.Lpostloop: + cmp %o1, 8 + bl,pn %icc, .Ltiny + membar #StoreStore|#StoreLoad +.Lmedium: + andn %o1, 0x7, %g1 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 0x8, %g1 + bne,pt %icc, 1b + add %o0, 0x08, %o0 + andcc %o1, 0x4, %g1 + be,pt %icc, .Ltiny + sub %o1, %g1, %o1 + stw %o4, [%o0 + 0x00] + add %o0, 0x4, %o0 +.Ltiny: + cmp %o1, 0 + be,pn %icc, .Lexit +1: subcc %o1, 1, %o1 + stb %o4, [%o0 + 0x00] + bne,pt %icc, 1b + add %o0, 1, %o0 +.Lexit: + retl + mov %o3, %o0 +.Lnon_bzero_loop: + mov 0x08, %g3 + mov 0x28, %o5 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + add %o0, 0x10, %o0 + stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x30, %o0 + ba,a,pt %icc, .Lpostloop + .size NG4bzero,.-NG4bzero diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S new file mode 100644 index 000000000000..a114cbcf2a48 --- /dev/null +++ b/arch/sparc/lib/NG4patch.S @@ -0,0 +1,54 @@ +/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant. + * + * Copyright (C) 2012 David S. Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara4_patch_copyops + .type niagara4_patch_copyops,#function +niagara4_patch_copyops: + NG_DO_PATCH(memcpy, NG4memcpy) + NG_DO_PATCH(___copy_from_user, NG4copy_from_user) + NG_DO_PATCH(___copy_to_user, NG4copy_to_user) + retl + nop + .size niagara4_patch_copyops,.-niagara4_patch_copyops + + .globl niagara4_patch_bzero + .type niagara4_patch_bzero,#function +niagara4_patch_bzero: + NG_DO_PATCH(memset, NG4memset) + NG_DO_PATCH(__bzero, NG4bzero) + NG_DO_PATCH(__clear_user, NGclear_user) + NG_DO_PATCH(tsb_init, NGtsb_init) + retl + nop + .size niagara4_patch_bzero,.-niagara4_patch_bzero + + .globl niagara4_patch_pageops + .type niagara4_patch_pageops,#function +niagara4_patch_pageops: + NG_DO_PATCH(copy_user_page, NG4copy_user_page) + NG_DO_PATCH(_clear_page, NG4clear_page) + NG_DO_PATCH(clear_user_page, NG4clear_user_page) + retl + nop + .size niagara4_patch_pageops,.-niagara4_patch_pageops diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S index b9e790b9c6b8..423d46e2258b 100644 --- a/arch/sparc/lib/NGpage.S +++ b/arch/sparc/lib/NGpage.S @@ -59,6 +59,8 @@ NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ restore .align 32 + .globl NGclear_page + .globl NGclear_user_page NGclear_page: /* %o0=dest */ NGclear_user_page: /* %o0=dest, %o1=vaddr */ rd %asi, %g3 diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c index 3b31218cafc6..ee31b884c61b 100644 --- a/arch/sparc/lib/ksyms.c +++ b/arch/sparc/lib/ksyms.c @@ -134,6 +134,10 @@ EXPORT_SYMBOL(copy_user_page); void VISenter(void); EXPORT_SYMBOL(VISenter); +/* CRYPTO code needs this */ +void VISenterhalf(void); +EXPORT_SYMBOL(VISenterhalf); + extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *); extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *, unsigned long *); |