diff options
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r-- | arch/ia64/lib/Makefile | 48 | ||||
-rw-r--r-- | arch/ia64/lib/checksum.c | 102 | ||||
-rw-r--r-- | arch/ia64/lib/clear_page.S | 79 | ||||
-rw-r--r-- | arch/ia64/lib/clear_user.S | 212 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page.S | 101 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page_mck.S | 188 | ||||
-rw-r--r-- | arch/ia64/lib/copy_user.S | 613 | ||||
-rw-r--r-- | arch/ia64/lib/csum_partial_copy.c | 98 | ||||
-rw-r--r-- | arch/ia64/lib/do_csum.S | 324 | ||||
-rw-r--r-- | arch/ia64/lib/flush.S | 119 | ||||
-rw-r--r-- | arch/ia64/lib/idiv32.S | 86 | ||||
-rw-r--r-- | arch/ia64/lib/idiv64.S | 83 | ||||
-rw-r--r-- | arch/ia64/lib/io.c | 51 | ||||
-rw-r--r-- | arch/ia64/lib/ip_fast_csum.S | 148 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy.S | 304 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy_mck.S | 659 | ||||
-rw-r--r-- | arch/ia64/lib/memset.S | 365 | ||||
-rw-r--r-- | arch/ia64/lib/strlen.S | 195 | ||||
-rw-r--r-- | arch/ia64/lib/strncpy_from_user.S | 47 | ||||
-rw-r--r-- | arch/ia64/lib/strnlen_user.S | 48 | ||||
-rw-r--r-- | arch/ia64/lib/xor.S | 181 |
21 files changed, 0 insertions, 4051 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile deleted file mode 100644 index 081fcba01dc0..000000000000 --- a/arch/ia64/lib/Makefile +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for ia64-specific library routines.. -# - -lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ - __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ - checksum.o clear_page.o csum_partial_copy.o \ - clear_user.o strncpy_from_user.o strnlen_user.o \ - flush.o ip_fast_csum.o do_csum.o \ - memset.o strlen.o xor.o - -lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o -lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o - -AFLAGS___divdi3.o = -AFLAGS___udivdi3.o = -DUNSIGNED -AFLAGS___moddi3.o = -DMODULO -AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO - -AFLAGS___divsi3.o = -AFLAGS___udivsi3.o = -DUNSIGNED -AFLAGS___modsi3.o = -DMODULO -AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO - -$(obj)/__divdi3.o: $(src)/idiv64.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__moddi3.o: $(src)/idiv64.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__divsi3.o: $(src)/idiv32.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__modsi3.o: $(src)/idiv32.S FORCE - $(call if_changed_rule,as_o_S) - -$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE - $(call if_changed_rule,as_o_S) diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c deleted file mode 100644 index d26517fe3500..000000000000 --- a/arch/ia64/lib/checksum.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Network checksum routines - * - * Copyright (C) 1999, 2003 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * - * Most of the code coming from arch/alpha/lib/checksum.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed.. - */ - -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/byteorder.h> - -static inline unsigned short -from64to16 (unsigned long x) -{ - /* add up 32-bit words for 33 bits */ - x = (x & 0xffffffff) + (x >> 32); - /* add up 16-bit and 17-bit words for 17+c bits */ - x = (x & 0xffff) + (x >> 16); - /* add up 16-bit and 2-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented. - */ -__sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - return (__force __sum16)~from64to16( - (__force u64)saddr + (__force u64)daddr + - (__force u64)sum + ((len + proto) << 8)); -} - -EXPORT_SYMBOL(csum_tcpudp_magic); - -__wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - unsigned long result; - - result = (__force u64)saddr + (__force u64)daddr + - (__force u64)sum + ((len + proto) << 8); - - /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ - /* 64 to 33 */ - result = (result & 0xffffffff) + (result >> 32); - /* 33 to 32 */ - result = (result & 0xffffffff) + (result >> 32); - return (__force __wsum)result; -} -EXPORT_SYMBOL(csum_tcpudp_nofold); - -extern unsigned long do_csum (const unsigned char *, long); - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 32-bit boundary - */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - u64 result = do_csum(buff, len); - - /* add in old sum, and carry.. */ - result += (__force u32)sum; - /* 32+c bits -> 32 bits */ - result = (result & 0xffffffff) + (result >> 32); - return (__force __wsum)result; -} - -EXPORT_SYMBOL(csum_partial); - -/* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c - */ -__sum16 ip_compute_csum (const void *buff, int len) -{ - return (__force __sum16)~do_csum(buff,len); -} - -EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S deleted file mode 100644 index ba0dd2538fa5..000000000000 --- a/arch/ia64/lib/clear_page.S +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1999-2002 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * David Mosberger-Tang <davidm@hpl.hp.com> - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> - * - * 1/06/01 davidm Tuned for Itanium. - * 2/12/02 kchen Tuned for both Itanium and McKinley - * 3/08/02 davidm Some more tweaking - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> -#include <asm/page.h> - -#ifdef CONFIG_ITANIUM -# define L3_LINE_SIZE 64 // Itanium L3 line size -# define PREFETCH_LINES 9 // magic number -#else -# define L3_LINE_SIZE 128 // McKinley L3 line size -# define PREFETCH_LINES 12 // magic number -#endif - -#define saved_lc r2 -#define dst_fetch r3 -#define dst1 r8 -#define dst2 r9 -#define dst3 r10 -#define dst4 r11 - -#define dst_last r31 - -GLOBAL_ENTRY(clear_page) - .prologue - .regstk 1,0,0,0 - mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until - .save ar.lc, saved_lc - mov saved_lc = ar.lc - - .body - mov ar.lc = (PREFETCH_LINES - 1) - mov dst_fetch = in0 - adds dst1 = 16, in0 - adds dst2 = 32, in0 - ;; -.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE - adds dst3 = 48, in0 // executing this multiple times is harmless - br.cloop.sptk.few .fetch - ;; - addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch - mov ar.lc = r16 // one L3 line per iteration - adds dst4 = 64, in0 - ;; -#ifdef CONFIG_ITANIUM - // Optimized for Itanium -1: stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 - cmp.lt p8,p0=dst_fetch, dst_last - ;; -#else - // Optimized for McKinley -1: stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 - stf.spill.nta [dst3] = f0, 64 - stf.spill.nta [dst4] = f0, 128 - cmp.lt p8,p0=dst_fetch, dst_last - ;; - stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 -#endif - stf.spill.nta [dst3] = f0, 64 -(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE - br.cloop.sptk.few 1b - ;; - mov ar.lc = saved_lc // restore lc - br.ret.sptk.many rp -END(clear_page) -EXPORT_SYMBOL(clear_page) diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S deleted file mode 100644 index 1d9e45ccf8e5..000000000000 --- a/arch/ia64/lib/clear_user.S +++ /dev/null @@ -1,212 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This routine clears to zero a linear memory buffer in user space. - * - * Inputs: - * in0: address of buffer - * in1: length of buffer in bytes - * Outputs: - * r8: number of bytes that didn't get cleared due to a fault - * - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -// -// arguments -// -#define buf r32 -#define len r33 - -// -// local registers -// -#define cnt r16 -#define buf2 r17 -#define saved_lc r18 -#define saved_pfs r19 -#define tmp r20 -#define len2 r21 -#define len3 r22 - -// -// Theory of operations: -// - we check whether or not the buffer is small, i.e., less than 17 -// in which case we do the byte by byte loop. -// -// - Otherwise we go progressively from 1 byte store to 8byte store in -// the head part, the body is a 16byte store loop and we finish we the -// tail for the last 15 bytes. -// The good point about this breakdown is that the long buffer handling -// contains only 2 branches. -// -// The reason for not using shifting & masking for both the head and the -// tail is to stay semantically correct. This routine is not supposed -// to write bytes outside of the buffer. While most of the time this would -// be ok, we can't tolerate a mistake. A classical example is the case -// of multithreaded code were to the extra bytes touched is actually owned -// by another thread which runs concurrently to ours. Another, less likely, -// example is with device drivers where reading an I/O mapped location may -// have side effects (same thing for writing). -// - -GLOBAL_ENTRY(__do_clear_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,2,0,0,0 - cmp.eq p6,p0=r0,len // check for zero length - .save ar.lc, saved_lc - mov saved_lc=ar.lc // preserve ar.lc (slow) - .body - ;; // avoid WAW on CFM - adds tmp=-1,len // br.ctop is repeat/until - mov ret0=len // return value is length at this point -(p6) br.ret.spnt.many rp - ;; - cmp.lt p6,p0=16,len // if len > 16 then long memset - mov ar.lc=tmp // initialize lc for small count -(p6) br.cond.dptk .long_do_clear - ;; // WAR on ar.lc - // - // worst case 16 iterations, avg 8 iterations - // - // We could have played with the predicates to use the extra - // M slot for 2 stores/iteration but the cost the initialization - // the various counters compared to how long the loop is supposed - // to last on average does not make this solution viable. - // -1: - EX( .Lexit1, st1 [buf]=r0,1 ) - adds len=-1,len // countdown length using len - br.cloop.dptk 1b - ;; // avoid RAW on ar.lc - // - // .Lexit4: comes from byte by byte loop - // len contains bytes left -.Lexit1: - mov ret0=len // faster than using ar.lc - mov ar.lc=saved_lc - br.ret.sptk.many rp // end of short clear_user - - - // - // At this point we know we have more than 16 bytes to copy - // so we focus on alignment (no branches required) - // - // The use of len/len2 for countdown of the number of bytes left - // instead of ret0 is due to the fact that the exception code - // changes the values of r8. - // -.long_do_clear: - tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) - ;; - EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned -(p6) adds len=-1,len;; // sync because buf is modified - tbit.nz p6,p0=buf,1 - ;; - EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned -(p6) adds len=-2,len;; - tbit.nz p6,p0=buf,2 - ;; - EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned -(p6) adds len=-4,len;; - tbit.nz p6,p0=buf,3 - ;; - EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned -(p6) adds len=-8,len;; - shr.u cnt=len,4 // number of 128-bit (2x64bit) words - ;; - cmp.eq p6,p0=r0,cnt - adds tmp=-1,cnt -(p6) br.cond.dpnt .dotail // we have less than 16 bytes left - ;; - adds buf2=8,buf // setup second base pointer - mov ar.lc=tmp - ;; - - // - // 16bytes/iteration core loop - // - // The second store can never generate a fault because - // we come into the loop only when we are 16-byte aligned. - // This means that if we cross a page then it will always be - // in the first store and never in the second. - // - // - // We need to keep track of the remaining length. A possible (optimistic) - // way would be to use ar.lc and derive how many byte were left by - // doing : left= 16*ar.lc + 16. this would avoid the addition at - // every iteration. - // However we need to keep the synchronization point. A template - // M;;MB does not exist and thus we can keep the addition at no - // extra cycle cost (use a nop slot anyway). It also simplifies the - // (unlikely) error recovery code - // - -2: EX(.Lexit3, st8 [buf]=r0,16 ) - ;; // needed to get len correct when error - st8 [buf2]=r0,16 - adds len=-16,len - br.cloop.dptk 2b - ;; - mov ar.lc=saved_lc - // - // tail correction based on len only - // - // We alternate the use of len3,len2 to allow parallelism and correct - // error handling. We also reuse p6/p7 to return correct value. - // The addition of len2/len3 does not cost anything more compared to - // the regular memset as we had empty slots. - // -.dotail: - mov len2=len // for parallelization of error handling - mov len3=len - tbit.nz p6,p0=len,3 - ;; - EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes -(p6) adds len3=-8,len2 - tbit.nz p7,p6=len,2 - ;; - EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes -(p7) adds len2=-4,len3 - tbit.nz p6,p7=len,1 - ;; - EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes -(p6) adds len3=-2,len2 - tbit.nz p7,p6=len,0 - ;; - EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left - mov ret0=r0 // success - br.ret.sptk.many rp // end of most likely path - - // - // Outlined error handling code - // - - // - // .Lexit3: comes from core loop, need restore pr/lc - // len contains bytes left - // - // - // .Lexit2: - // if p6 -> coming from st8 or st2 : len2 contains what's left - // if p7 -> coming from st4 or st1 : len3 contains what's left - // We must restore lc/pr even though might not have been used. -.Lexit2: - .pred.rel "mutex", p6, p7 -(p6) mov len=len2 -(p7) mov len=len3 - ;; - // - // .Lexit4: comes from head, need not restore pr/lc - // len contains bytes left - // -.Lexit3: - mov ret0=len - mov ar.lc=saved_lc - br.ret.sptk.many rp -END(__do_clear_user) -EXPORT_SYMBOL(__do_clear_user) diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S deleted file mode 100644 index c0a0e6b2af00..000000000000 --- a/arch/ia64/lib/copy_page.S +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * Optimized version of the standard copy_page() function - * - * Inputs: - * in0: address of target page - * in1: address of source page - * Output: - * no return value - * - * Copyright (C) 1999, 2001 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * David Mosberger <davidm@hpl.hp.com> - * - * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. - */ -#include <linux/export.h> -#include <asm/asmmacro.h> -#include <asm/page.h> - -#define PIPE_DEPTH 3 -#define EPI p[PIPE_DEPTH-1] - -#define lcount r16 -#define saved_pr r17 -#define saved_lc r18 -#define saved_pfs r19 -#define src1 r20 -#define src2 r21 -#define tgt1 r22 -#define tgt2 r23 -#define srcf r24 -#define tgtf r25 -#define tgt_last r26 - -#define Nrot ((8*PIPE_DEPTH+7)&~7) - -GLOBAL_ENTRY(copy_page) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot - - .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ - t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] - .rotp p[PIPE_DEPTH] - - .save ar.lc, saved_lc - mov saved_lc=ar.lc - mov ar.ec=PIPE_DEPTH - - mov lcount=PAGE_SIZE/64-1 - .save pr, saved_pr - mov saved_pr=pr - mov pr.rot=1<<16 - - .body - - mov src1=in1 - adds src2=8,in1 - mov tgt_last = PAGE_SIZE - ;; - adds tgt2=8,in0 - add srcf=512,in1 - mov ar.lc=lcount - mov tgt1=in0 - add tgtf=512,in0 - add tgt_last = tgt_last, in0 - ;; -1: -(p[0]) ld8 t1[0]=[src1],16 -(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 -(p[0]) ld8 t2[0]=[src2],16 -(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 - cmp.ltu p6,p0 = tgtf, tgt_last - ;; -(p[0]) ld8 t3[0]=[src1],16 -(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 -(p[0]) ld8 t4[0]=[src2],16 -(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 - ;; -(p[0]) ld8 t5[0]=[src1],16 -(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 -(p[0]) ld8 t6[0]=[src2],16 -(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 - ;; -(p[0]) ld8 t7[0]=[src1],16 -(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 -(p[0]) ld8 t8[0]=[src2],16 -(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 - -(p6) lfetch [srcf], 64 -(p6) lfetch [tgtf], 64 - br.ctop.sptk.few 1b - ;; - mov pr=saved_pr,0xffffffffffff0000 // restore predicates - mov ar.pfs=saved_pfs - mov ar.lc=saved_lc - br.ret.sptk.many rp -END(copy_page) -EXPORT_SYMBOL(copy_page) diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S deleted file mode 100644 index 5e8bb4b4b535..000000000000 --- a/arch/ia64/lib/copy_page_mck.S +++ /dev/null @@ -1,188 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * McKinley-optimized version of copy_page(). - * - * Copyright (C) 2002 Hewlett-Packard Co - * David Mosberger <davidm@hpl.hp.com> - * - * Inputs: - * in0: address of target page - * in1: address of source page - * Output: - * no return value - * - * General idea: - * - use regular loads and stores to prefetch data to avoid consuming M-slot just for - * lfetches => good for in-cache performance - * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single - * cycle - * - * Principle of operation: - * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. - * To avoid secondary misses in L2, we prefetch both source and destination with a line-size - * of 128 bytes. When both of these lines are in the L2 and the first half of the - * source line is in L1, we start copying the remaining words. The second half of the - * source line is prefetched in an earlier iteration, so that by the time we start - * accessing it, it's also present in the L1. - * - * We use a software-pipelined loop to control the overall operation. The pipeline - * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching - * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination - * cache-lines, the last K stages are used to copy the cache-line words not copied by - * the prefetches. The four relevant points in the pipelined are called A, B, C, D: - * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line - * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought - * into L1D and p[D] is TRUE if a cacheline needs to be copied. - * - * This all sounds very complicated, but thanks to the modulo-scheduled loop support, - * the resulting code is very regular and quite easy to follow (once you get the idea). - * - * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented - * as the separate .prefetch_loop. Logically, this loop performs exactly like the - * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed, - * so that each loop iteration is faster (again, good for cached case). - * - * When reading the code, it helps to keep the following picture in mind: - * - * word 0 word 1 - * +------+------+--- - * | v[x] | t1 | ^ - * | t2 | t3 | | - * | t4 | t5 | | - * | t6 | t7 | | 128 bytes - * | n[y] | t9 | | (L2 cache line) - * | t10 | t11 | | - * | t12 | t13 | | - * | t14 | t15 | v - * +------+------+--- - * - * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C] - * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in - * an order that avoids bank conflicts. - */ -#include <linux/export.h> -#include <asm/asmmacro.h> -#include <asm/page.h> - -#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st) - -#define src0 r2 -#define src1 r3 -#define dst0 r9 -#define dst1 r10 -#define src_pre_mem r11 -#define dst_pre_mem r14 -#define src_pre_l2 r15 -#define dst_pre_l2 r16 -#define t1 r17 -#define t2 r18 -#define t3 r19 -#define t4 r20 -#define t5 t1 // alias! -#define t6 t2 // alias! -#define t7 t3 // alias! -#define t9 t5 // alias! -#define t10 t4 // alias! -#define t11 t7 // alias! -#define t12 t6 // alias! -#define t14 t10 // alias! -#define t13 r21 -#define t15 r22 - -#define saved_lc r23 -#define saved_pr r24 - -#define A 0 -#define B (PREFETCH_DIST) -#define C (B + PREFETCH_DIST) -#define D (C + 3) -#define N (D + 1) -#define Nrot ((N + 7) & ~7) - -GLOBAL_ENTRY(copy_page) - .prologue - alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot - - .rotr v[2*PREFETCH_DIST], n[D-C+1] - .rotp p[N] - - .save ar.lc, saved_lc - mov saved_lc = ar.lc - .save pr, saved_pr - mov saved_pr = pr - .body - - mov src_pre_mem = in1 - mov pr.rot = 0x10000 - mov ar.ec = 1 // special unrolled loop - - mov dst_pre_mem = in0 - mov ar.lc = 2*PREFETCH_DIST - 1 - - add src_pre_l2 = 8*8, in1 - add dst_pre_l2 = 8*8, in0 - add src0 = 8, in1 // first t1 src - add src1 = 3*8, in1 // first t3 src - add dst0 = 8, in0 // first t1 dst - add dst1 = 3*8, in0 // first t3 dst - mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1 - nop.m 0 - nop.i 0 - ;; - // same as .line_copy loop, but with all predicated-off instructions removed: -.prefetch_loop: -(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 -(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 - br.ctop.sptk .prefetch_loop - ;; - cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero) - mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits! - mov ar.ec = N // # of stages in pipeline - ;; -.line_copy: -(p[D]) ld8 t2 = [src0], 3*8 // M0 -(p[D]) ld8 t4 = [src1], 3*8 // M1 -(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory -(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2 - ;; -(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory -(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2 -(p[D]) st8 [dst0] = t1, 8 // M2 -(p[D]) st8 [dst1] = t3, 8 // M3 - ;; -(p[D]) ld8 t5 = [src0], 8 -(p[D]) ld8 t7 = [src1], 3*8 -(p[D]) st8 [dst0] = t2, 3*8 -(p[D]) st8 [dst1] = t4, 3*8 - ;; -(p[D]) ld8 t6 = [src0], 3*8 -(p[D]) ld8 t10 = [src1], 8 -(p[D]) st8 [dst0] = t5, 8 -(p[D]) st8 [dst1] = t7, 3*8 - ;; -(p[D]) ld8 t9 = [src0], 3*8 -(p[D]) ld8 t11 = [src1], 3*8 -(p[D]) st8 [dst0] = t6, 3*8 -(p[D]) st8 [dst1] = t10, 8 - ;; -(p[D]) ld8 t12 = [src0], 8 -(p[D]) ld8 t14 = [src1], 8 -(p[D]) st8 [dst0] = t9, 3*8 -(p[D]) st8 [dst1] = t11, 3*8 - ;; -(p[D]) ld8 t13 = [src0], 4*8 -(p[D]) ld8 t15 = [src1], 4*8 -(p[D]) st8 [dst0] = t12, 8 -(p[D]) st8 [dst1] = t14, 8 - ;; -(p[D-1])ld8 t1 = [src0], 8 -(p[D-1])ld8 t3 = [src1], 8 -(p[D]) st8 [dst0] = t13, 4*8 -(p[D]) st8 [dst1] = t15, 4*8 - br.ctop.sptk .line_copy - ;; - mov ar.lc = saved_lc - mov pr = saved_pr, -1 - br.ret.sptk.many rp -END(copy_page) -EXPORT_SYMBOL(copy_page) diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S deleted file mode 100644 index 8daab72cfe77..000000000000 --- a/arch/ia64/lib/copy_user.S +++ /dev/null @@ -1,613 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * Optimized version of the copy_user() routine. - * It is used to copy date across the kernel/user boundary. - * - * The source and destination are always on opposite side of - * the boundary. When reading from user space we must catch - * faults on loads. When writing to user space we must catch - * errors on stores. Note that because of the nature of the copy - * we don't need to worry about overlapping regions. - * - * - * Inputs: - * in0 address of source buffer - * in1 address of destination buffer - * in2 number of bytes to copy - * - * Outputs: - * ret0 0 in case of success. The number of bytes NOT copied in - * case of error. - * - * Copyright (C) 2000-2001 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * - * Fixme: - * - handle the case where we have more than 16 bytes and the alignment - * are different. - * - more benchmarking - * - fix extraneous stop bit introduced by the EX() macro. - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -// -// Tuneable parameters -// -#define COPY_BREAK 16 // we do byte copy below (must be >=16) -#define PIPE_DEPTH 21 // pipe depth - -#define EPI p[PIPE_DEPTH-1] - -// -// arguments -// -#define dst in0 -#define src in1 -#define len in2 - -// -// local registers -// -#define t1 r2 // rshift in bytes -#define t2 r3 // lshift in bytes -#define rshift r14 // right shift in bits -#define lshift r15 // left shift in bits -#define word1 r16 -#define word2 r17 -#define cnt r18 -#define len2 r19 -#define saved_lc r20 -#define saved_pr r21 -#define tmp r22 -#define val r23 -#define src1 r24 -#define dst1 r25 -#define src2 r26 -#define dst2 r27 -#define len1 r28 -#define enddst r29 -#define endsrc r30 -#define saved_pfs r31 - -GLOBAL_ENTRY(__copy_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) - - .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] - .rotp p[PIPE_DEPTH] - - adds len2=-1,len // br.ctop is repeat/until - mov ret0=r0 - - ;; // RAW of cfm when len=0 - cmp.eq p8,p0=r0,len // check for zero length - .save ar.lc, saved_lc - mov saved_lc=ar.lc // preserve ar.lc (slow) -(p8) br.ret.spnt.many rp // empty mempcy() - ;; - add enddst=dst,len // first byte after end of source - add endsrc=src,len // first byte after end of destination - .save pr, saved_pr - mov saved_pr=pr // preserve predicates - - .body - - mov dst1=dst // copy because of rotation - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - - mov src1=src // copy because of rotation - mov ar.lc=len2 // initialize lc for small count - cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy - - xor tmp=src,dst // same alignment test prepare -(p10) br.cond.dptk .long_copy_user - ;; // RAW pr.rot/p16 ? - // - // Now we do the byte by byte loop with software pipeline - // - // p7 is necessarily false by now -1: - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 1b - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs // restore ar.ec - br.ret.sptk.many rp // end of short memcpy - - // - // Not 8-byte aligned - // -.diff_align_copy_user: - // At this point we know we have more than 16 bytes to copy - // and also that src and dest do _not_ have the same alignment. - and src2=0x7,src1 // src offset - and dst2=0x7,dst1 // dst offset - ;; - // The basic idea is that we copy byte-by-byte at the head so - // that we can reach 8-byte alignment for both src1 and dst1. - // Then copy the body using software pipelined 8-byte copy, - // shifting the two back-to-back words right and left, then copy - // the tail by copying byte-by-byte. - // - // Fault handling. If the byte-by-byte at the head fails on the - // load, then restart and finish the pipleline by copying zeros - // to the dst1. Then copy zeros for the rest of dst1. - // If 8-byte software pipeline fails on the load, do the same as - // failure_in3 does. If the byte-by-byte at the tail fails, it is - // handled simply by failure_in_pipe1. - // - // The case p14 represents the source has more bytes in the - // the first word (by the shifted part), whereas the p15 needs to - // copy some bytes from the 2nd word of the source that has the - // tail of the 1st of the destination. - // - - // - // Optimization. If dst1 is 8-byte aligned (quite common), we don't need - // to copy the head to dst1, to start 8-byte copy software pipeline. - // We know src1 is not 8-byte aligned in this case. - // - cmp.eq p14,p15=r0,dst2 -(p15) br.cond.spnt 1f - ;; - sub t1=8,src2 - mov t2=src2 - ;; - shl rshift=t2,3 - sub len1=len,t1 // set len1 - ;; - sub lshift=64,rshift - ;; - br.cond.spnt .word_copy_user - ;; -1: - cmp.leu p14,p15=src2,dst2 - sub t1=dst2,src2 - ;; - .pred.rel "mutex", p14, p15 -(p14) sub word1=8,src2 // (8 - src offset) -(p15) sub t1=r0,t1 // absolute value -(p15) sub word1=8,dst2 // (8 - dst offset) - ;; - // For the case p14, we don't need to copy the shifted part to - // the 1st word of destination. - sub t2=8,t1 -(p14) sub word1=word1,t1 - ;; - sub len1=len,word1 // resulting len -(p15) shl rshift=t1,3 // in bits -(p14) shl rshift=t2,3 - ;; -(p14) sub len1=len1,t1 - adds cnt=-1,word1 - ;; - sub lshift=64,rshift - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - mov ar.lc=cnt - ;; -2: - EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 2b - ;; - clrrrb - ;; -.word_copy_user: - cmp.gtu p9,p0=16,len1 -(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy - ;; - shr.u cnt=len1,3 // number of 64-bit words - ;; - adds cnt=-1,cnt - ;; - .pred.rel "mutex", p14, p15 -(p14) sub src1=src1,t2 -(p15) sub src1=src1,t1 - // - // Now both src1 and dst1 point to an 8-byte aligned address. And - // we have more than 8 bytes to copy. - // - mov ar.lc=cnt - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - ;; -3: - // - // The pipleline consists of 3 stages: - // 1 (p16): Load a word from src1 - // 2 (EPI_1): Shift right pair, saving to tmp - // 3 (EPI): Store tmp to dst1 - // - // To make it simple, use at least 2 (p16) loops to set up val1[n] - // because we need 2 back-to-back val1[] to get tmp. - // Note that this implies EPI_2 must be p18 or greater. - // - -#define EPI_1 p[PIPE_DEPTH-2] -#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift -#define CASE(pred, shift) \ - (pred) br.cond.spnt .copy_user_bit##shift -#define BODY(rshift) \ -.copy_user_bit##rshift: \ -1: \ - EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ - EX(3f,(p16) ld8 val1[1]=[src1],8); \ -(p16) mov val1[0]=r0; \ - br.ctop.dptk 1b; \ - ;; \ - br.cond.sptk.many .diff_align_do_tail; \ -2: \ -(EPI) st8 [dst1]=tmp,8; \ -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ -3: \ -(p16) mov val1[1]=r0; \ -(p16) mov val1[0]=r0; \ - br.ctop.dptk 2b; \ - ;; \ - br.cond.sptk.many .failure_in2 - - // - // Since the instruction 'shrp' requires a fixed 128-bit value - // specifying the bits to shift, we need to provide 7 cases - // below. - // - SWITCH(p6, 8) - SWITCH(p7, 16) - SWITCH(p8, 24) - SWITCH(p9, 32) - SWITCH(p10, 40) - SWITCH(p11, 48) - SWITCH(p12, 56) - ;; - CASE(p6, 8) - CASE(p7, 16) - CASE(p8, 24) - CASE(p9, 32) - CASE(p10, 40) - CASE(p11, 48) - CASE(p12, 56) - ;; - BODY(8) - BODY(16) - BODY(24) - BODY(32) - BODY(40) - BODY(48) - BODY(56) - ;; -.diff_align_do_tail: - .pred.rel "mutex", p14, p15 -(p14) sub src1=src1,t1 -(p14) adds dst1=-8,dst1 -(p15) sub dst1=dst1,t1 - ;; -4: - // Tail correction. - // - // The problem with this piplelined loop is that the last word is not - // loaded and thus parf of the last word written is not correct. - // To fix that, we simply copy the tail byte by byte. - - sub len1=endsrc,src1,1 - clrrrb - ;; - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - mov ar.lc=len1 - ;; -5: - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 5b - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // Beginning of long mempcy (i.e. > 16 bytes) - // -.long_copy_user: - tbit.nz p6,p7=src1,0 // odd alignment - and tmp=7,tmp - ;; - cmp.eq p10,p8=r0,tmp - mov len1=len // copy because of rotation -(p8) br.cond.dpnt .diff_align_copy_user - ;; - // At this point we know we have more than 16 bytes to copy - // and also that both src and dest have the same alignment - // which may not be the one we want. So for now we must move - // forward slowly until we reach 16byte alignment: no need to - // worry about reaching the end of buffer. - // - EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned -(p6) adds len1=-1,len1;; - tbit.nz p7,p0=src1,1 - ;; - EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned -(p7) adds len1=-2,len1;; - tbit.nz p8,p0=src1,2 - ;; - // - // Stop bit not required after ld4 because if we fail on ld4 - // we have never executed the ld1, therefore st1 is not executed. - // - EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned - ;; - EX(.failure_out,(p6) st1 [dst1]=val1[0],1) - tbit.nz p9,p0=src1,3 - ;; - // - // Stop bit not required after ld8 because if we fail on ld8 - // we have never executed the ld2, therefore st2 is not executed. - // - EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned - EX(.failure_out,(p7) st2 [dst1]=val1[1],2) -(p8) adds len1=-4,len1 - ;; - EX(.failure_out, (p8) st4 [dst1]=val2[0],4) -(p9) adds len1=-8,len1;; - shr.u cnt=len1,4 // number of 128-bit (2x64bit) words - ;; - EX(.failure_out, (p9) st8 [dst1]=val2[1],8) - tbit.nz p6,p0=len1,3 - cmp.eq p7,p0=r0,cnt - adds tmp=-1,cnt // br.ctop is repeat/until -(p7) br.cond.dpnt .dotail // we have less than 16 bytes left - ;; - adds src2=8,src1 - adds dst2=8,dst1 - mov ar.lc=tmp - ;; - // - // 16bytes/iteration - // -2: - EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) -(p16) ld8 val2[0]=[src2],16 - - EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 - br.ctop.dptk 2b - ;; // RAW on src1 when fall through from loop - // - // Tail correction based on len only - // - // No matter where we come from (loop or test) the src1 pointer - // is 16 byte aligned AND we have less than 16 bytes to copy. - // -.dotail: - EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes - tbit.nz p7,p0=len1,2 - ;; - EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes - tbit.nz p8,p0=len1,1 - ;; - EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes - tbit.nz p9,p0=len1,0 - ;; - EX(.failure_out, (p6) st8 [dst1]=val1[0],8) - ;; - EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left - mov ar.lc=saved_lc - ;; - EX(.failure_out,(p7) st4 [dst1]=val1[1],4) - mov pr=saved_pr,0xffffffffffff0000 - ;; - EX(.failure_out, (p8) st2 [dst1]=val2[0],2) - mov ar.pfs=saved_pfs - ;; - EX(.failure_out, (p9) st1 [dst1]=val2[1]) - br.ret.sptk.many rp - - - // - // Here we handle the case where the byte by byte copy fails - // on the load. - // Several factors make the zeroing of the rest of the buffer kind of - // tricky: - // - the pipeline: loads/stores are not in sync (pipeline) - // - // In the same loop iteration, the dst1 pointer does not directly - // reflect where the faulty load was. - // - // - pipeline effect - // When you get a fault on load, you may have valid data from - // previous loads not yet store in transit. Such data must be - // store normally before moving onto zeroing the rest. - // - // - single/multi dispersal independence. - // - // solution: - // - we don't disrupt the pipeline, i.e. data in transit in - // the software pipeline will be eventually move to memory. - // We simply replace the load with a simple mov and keep the - // pipeline going. We can't really do this inline because - // p16 is always reset to 1 when lc > 0. - // -.failure_in_pipe1: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied -1: -(p16) mov val1[0]=r0 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 - br.ctop.dptk 1b - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // This is the case where the byte by byte copy fails on the load - // when we copy the head. We need to finish the pipeline and copy - // zeros for the rest of the destination. Since this happens - // at the top we still need to fill the body and tail. -.failure_in_pipe2: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied -2: -(p16) mov val1[0]=r0 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 - br.ctop.dptk 2b - ;; - sub len=enddst,dst1,1 // precompute len - br.cond.dptk.many .failure_in1bis - ;; - - // - // Here we handle the head & tail part when we check for alignment. - // The following code handles only the load failures. The - // main diffculty comes from the fact that loads/stores are - // scheduled. So when you fail on a load, the stores corresponding - // to previous successful loads must be executed. - // - // However some simplifications are possible given the way - // things work. - // - // 1) HEAD - // Theory of operation: - // - // Page A | Page B - // ---------|----- - // 1|8 x - // 1 2|8 x - // 4|8 x - // 1 4|8 x - // 2 4|8 x - // 1 2 4|8 x - // |1 - // |2 x - // |4 x - // - // page_size >= 4k (2^12). (x means 4, 2, 1) - // Here we suppose Page A exists and Page B does not. - // - // As we move towards eight byte alignment we may encounter faults. - // The numbers on each page show the size of the load (current alignment). - // - // Key point: - // - if you fail on 1, 2, 4 then you have never executed any smaller - // size loads, e.g. failing ld4 means no ld1 nor ld2 executed - // before. - // - // This allows us to simplify the cleanup code, because basically you - // only have to worry about "pending" stores in the case of a failing - // ld8(). Given the way the code is written today, this means only - // worry about st2, st4. There we can use the information encapsulated - // into the predicates. - // - // Other key point: - // - if you fail on the ld8 in the head, it means you went straight - // to it, i.e. 8byte alignment within an unexisting page. - // Again this comes from the fact that if you crossed just for the ld8 then - // you are 8byte aligned but also 16byte align, therefore you would - // either go for the 16byte copy loop OR the ld8 in the tail part. - // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible - // because it would mean you had 15bytes to copy in which case you - // would have defaulted to the byte by byte copy. - // - // - // 2) TAIL - // Here we now we have less than 16 bytes AND we are either 8 or 16 byte - // aligned. - // - // Key point: - // This means that we either: - // - are right on a page boundary - // OR - // - are at more than 16 bytes from a page boundary with - // at most 15 bytes to copy: no chance of crossing. - // - // This allows us to assume that if we fail on a load we haven't possibly - // executed any of the previous (tail) ones, so we don't need to do - // any stores. For instance, if we fail on ld2, this means we had - // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. - // - // This means that we are in a situation similar the a fault in the - // head part. That's nice! - // -.failure_in1: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied - sub len=endsrc,src1,1 - // - // we know that ret0 can never be zero at this point - // because we failed why trying to do a load, i.e. there is still - // some work to do. - // The failure_in1bis and length problem is taken care of at the - // calling side. - // - ;; -.failure_in1bis: // from (.failure_in3) - mov ar.lc=len // Continue with a stupid byte store. - ;; -5: - st1 [dst1]=r0,1 - br.cloop.dptk 5b - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // Here we simply restart the loop but instead - // of doing loads we fill the pipeline with zeroes - // We can't simply store r0 because we may have valid - // data in transit in the pipeline. - // ar.lc and ar.ec are setup correctly at this point - // - // we MUST use src1/endsrc here and not dst1/enddst because - // of the pipeline effect. - // -.failure_in3: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied - ;; -2: -(p16) mov val1[0]=r0 -(p16) mov val2[0]=r0 -(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 - br.ctop.dptk 2b - ;; - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? - sub len=enddst,dst1,1 // precompute len -(p6) br.cond.dptk .failure_in1bis - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - -.failure_in2: - sub ret0=endsrc,src1 - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? - sub len=enddst,dst1,1 // precompute len -(p6) br.cond.dptk .failure_in1bis - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // handling of failures on stores: that's the easy part - // -.failure_out: - sub ret0=enddst,dst1 - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - - mov ar.pfs=saved_pfs - br.ret.sptk.many rp -END(__copy_user) -EXPORT_SYMBOL(__copy_user) diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c deleted file mode 100644 index 917e3138b277..000000000000 --- a/arch/ia64/lib/csum_partial_copy.c +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Network Checksum & Copy routine - * - * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * - * Most of the code has been imported from Linux/Alpha - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/string.h> - -#include <net/checksum.h> - -/* - * XXX Fixme: those 2 inlines are meant for debugging and will go away - */ -static inline unsigned -short from64to16(unsigned long x) -{ - /* add up 32-bit words for 33 bits */ - x = (x & 0xffffffff) + (x >> 32); - /* add up 16-bit and 17-bit words for 17+c bits */ - x = (x & 0xffff) + (x >> 16); - /* add up 16-bit and 2-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -static inline -unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) -{ - int odd, count; - unsigned long result = (unsigned long)psum; - - if (len <= 0) - goto out; - odd = 1 & (unsigned long) buff; - if (odd) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *) buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ - if (count) { - unsigned long carry = 0; - do { - unsigned long w = *(unsigned long *) buff; - count--; - buff += 8; - result += carry; - result += w; - carry = (w > result); - } while (count); - result += carry; - result = (result & 0xffffffff) + (result >> 32); - } - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - - result = from64to16(result); - - if (odd) - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - -out: - return result; -} diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S deleted file mode 100644 index 6004dad2597c..000000000000 --- a/arch/ia64/lib/do_csum.S +++ /dev/null @@ -1,324 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * Optmized version of the standard do_csum() function - * - * Return: a 64bit quantity containing the 16bit Internet checksum - * - * Inputs: - * in0: address of buffer to checksum (char *) - * in1: length of the buffer (int) - * - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * - * 02/04/22 Ken Chen <kenneth.w.chen@intel.com> - * Data locality study on the checksum buffer. - * More optimization cleanup - remove excessive stop bits. - * 02/04/08 David Mosberger <davidm@hpl.hp.com> - * More cleanup and tuning. - * 01/04/18 Jun Nakajima <jun.nakajima@intel.com> - * Clean up and optimize and the software pipeline, loading two - * back-to-back 8-byte words per loop. Clean up the initialization - * for the loop. Support the cases where load latency = 1 or 2. - * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). - */ - -#include <asm/asmmacro.h> - -// -// Theory of operations: -// The goal is to go as quickly as possible to the point where -// we can checksum 16 bytes/loop. Before reaching that point we must -// take care of incorrect alignment of first byte. -// -// The code hereafter also takes care of the "tail" part of the buffer -// before entering the core loop, if any. The checksum is a sum so it -// allows us to commute operations. So we do the "head" and "tail" -// first to finish at full speed in the body. Once we get the head and -// tail values, we feed them into the pipeline, very handy initialization. -// -// Of course we deal with the special case where the whole buffer fits -// into one 8 byte word. In this case we have only one entry in the pipeline. -// -// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for -// possible load latency and also to accommodate for head and tail. -// -// The end of the function deals with folding the checksum from 64bits -// down to 16bits taking care of the carry. -// -// This version avoids synchronization in the core loop by also using a -// pipeline for the accumulation of the checksum in resultx[] (x=1,2). -// -// wordx[] (x=1,2) -// |---| -// | | 0 : new value loaded in pipeline -// |---| -// | | - : in transit data -// |---| -// | | LOAD_LATENCY : current value to add to checksum -// |---| -// | | LOAD_LATENCY+1 : previous value added to checksum -// |---| (previous iteration) -// -// resultx[] (x=1,2) -// |---| -// | | 0 : initial value -// |---| -// | | LOAD_LATENCY-1 : new checksum -// |---| -// | | LOAD_LATENCY : previous value of checksum -// |---| -// | | LOAD_LATENCY+1 : final checksum when out of the loop -// |---| -// -// -// See RFC1071 "Computing the Internet Checksum" for various techniques for -// calculating the Internet checksum. -// -// NOT YET DONE: -// - Maybe another algorithm which would take care of the folding at the -// end in a different manner -// - Work with people more knowledgeable than me on the network stack -// to figure out if we could not split the function depending on the -// type of packet or alignment we get. Like the ip_fast_csum() routine -// where we know we have at least 20bytes worth of data to checksum. -// - Do a better job of handling small packets. -// - Note on prefetching: it was found that under various load, i.e. ftp read/write, -// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% -// on the data that buffer points to (partly because the checksum is often preceded by -// a copy_from_user()). This finding indiate that lfetch will not be beneficial since -// the data is already in the cache. -// - -#define saved_pfs r11 -#define hmask r16 -#define tmask r17 -#define first1 r18 -#define firstval r19 -#define firstoff r20 -#define last r21 -#define lastval r22 -#define lastoff r23 -#define saved_lc r24 -#define saved_pr r25 -#define tmp1 r26 -#define tmp2 r27 -#define tmp3 r28 -#define carry1 r29 -#define carry2 r30 -#define first2 r31 - -#define buf in0 -#define len in1 - -#define LOAD_LATENCY 2 // XXX fix me - -#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) -# error "Only 1 or 2 is supported/tested for LOAD_LATENCY." -#endif - -#define PIPE_DEPTH (LOAD_LATENCY+2) -#define ELD p[LOAD_LATENCY] // end of load -#define ELD_1 p[LOAD_LATENCY+1] // and next stage - -// unsigned long do_csum(unsigned char *buf,long len) - -GLOBAL_ENTRY(do_csum) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,2,16,0,16 - .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] - .rotp p[PIPE_DEPTH], pC1[2], pC2[2] - mov ret0=r0 // in case we have zero length - cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) - ;; - add tmp1=buf,len // last byte's address - .save pr, saved_pr - mov saved_pr=pr // preserve predicates (rotation) -(p6) br.ret.spnt.many rp // return if zero or negative length - - mov hmask=-1 // initialize head mask - tbit.nz p15,p0=buf,0 // is buf an odd address? - and first1=-8,buf // 8-byte align down address of first1 element - - and firstoff=7,buf // how many bytes off for first1 element - mov tmask=-1 // initialize tail mask - - ;; - adds tmp2=-1,tmp1 // last-1 - and lastoff=7,tmp1 // how many bytes off for last element - ;; - sub tmp1=8,lastoff // complement to lastoff - and last=-8,tmp2 // address of word containing last byte - ;; - sub tmp3=last,first1 // tmp3=distance from first1 to last - .save ar.lc, saved_lc - mov saved_lc=ar.lc // save lc - cmp.eq p8,p9=last,first1 // everything fits in one word ? - - ld8 firstval=[first1],8 // load, ahead of time, "first1" word - and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 - shl tmp2=firstoff,3 // number of bits - ;; -(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed - shl tmp1=tmp1,3 // number of bits -(p9) adds tmp3=-8,tmp3 // effectively loaded - ;; -(p8) mov lastval=r0 // we don't need lastval if first1==last - shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ - shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] - ;; - .body -#define count tmp3 - -(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only -(p9) and word2[0]=lastval,tmask // mask last it as appropriate - shr.u count=count,3 // how many 8-byte? - ;; - // If count is odd, finish this 8-byte word so that we can - // load two back-to-back 8-byte words per loop thereafter. - and word1[0]=firstval,hmask // and mask it as appropriate - tbit.nz p10,p11=count,0 // if (count is odd) - ;; -(p8) mov result1[0]=word1[0] -(p9) add result1[0]=word1[0],word2[0] - ;; - cmp.ltu p6,p0=result1[0],word1[0] // check the carry - cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte - ;; -(p6) adds result1[0]=1,result1[0] -(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) -(p11) br.cond.dptk .do_csum16 // if (count is even) - - // Here count is odd. - ld8 word1[1]=[first1],8 // load an 8-byte word - cmp.eq p9,p10=1,count // if (count == 1) - adds count=-1,count // loaded an 8-byte word - ;; - add result1[0]=result1[0],word1[1] - ;; - cmp.ltu p6,p0=result1[0],word1[1] - ;; -(p6) adds result1[0]=1,result1[0] -(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit - // Fall through to calculate the checksum, feeding result1[0] as - // the initial value in result1[0]. - // - // Calculate the checksum loading two 8-byte words per loop. - // -.do_csum16: - add first2=8,first1 - shr.u count=count,1 // we do 16 bytes per loop - ;; - adds count=-1,count - mov carry1=r0 - mov carry2=r0 - brp.loop.imp 1f,2f - ;; - mov ar.ec=PIPE_DEPTH - mov ar.lc=count // set lc - mov pr.rot=1<<16 - // result1[0] must be initialized in advance. - mov result2[0]=r0 - ;; - .align 32 -1: -(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] -(pC1[1])adds carry1=1,carry1 -(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] -(pC2[1])adds carry2=1,carry2 -(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] -(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] -2: -(p[0]) ld8 word1[0]=[first1],16 -(p[0]) ld8 word2[0]=[first2],16 - br.ctop.sptk 1b - ;; - // Since len is a 32-bit value, carry cannot be larger than a 64-bit value. -(pC1[1])adds carry1=1,carry1 // since we miss the last one -(pC2[1])adds carry2=1,carry2 - ;; - add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 - add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 - ;; - cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 - cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 - ;; -(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] -(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] - ;; - add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] - ;; - cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] - ;; -(p6) adds result1[0]=1,result1[0] - ;; -.do_csum_exit: - // - // now fold 64 into 16 bits taking care of carry - // that's not very good because it has lots of sequentiality - // - mov tmp3=0xffff - zxt4 tmp1=result1[0] - shr.u tmp2=result1[0],32 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add ret0=tmp1,tmp2 - mov pr=saved_pr,0xffffffffffff0000 - ;; - // if buf was odd then swap bytes - mov ar.pfs=saved_pfs // restore ar.ec -(p15) mux1 ret0=ret0,@rev // reverse word - ;; - mov ar.lc=saved_lc -(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes - br.ret.sptk.many rp - -// I (Jun Nakajima) wrote an equivalent code (see below), but it was -// not much better than the original. So keep the original there so that -// someone else can challenge. -// -// shr.u word1[0]=result1[0],32 -// zxt4 result1[0]=result1[0] -// ;; -// add result1[0]=result1[0],word1[0] -// ;; -// zxt2 result2[0]=result1[0] -// extr.u word1[0]=result1[0],16,16 -// shr.u carry1=result1[0],32 -// ;; -// add result2[0]=result2[0],word1[0] -// ;; -// add result2[0]=result2[0],carry1 -// ;; -// extr.u ret0=result2[0],16,16 -// ;; -// add ret0=ret0,result2[0] -// ;; -// zxt2 ret0=ret0 -// mov ar.pfs=saved_pfs // restore ar.ec -// mov pr=saved_pr,0xffffffffffff0000 -// ;; -// // if buf was odd then swap bytes -// mov ar.lc=saved_lc -//(p15) mux1 ret0=ret0,@rev // reverse word -// ;; -//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes -// br.ret.sptk.many rp - -END(do_csum) diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S deleted file mode 100644 index f8e795fe45cb..000000000000 --- a/arch/ia64/lib/flush.S +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Cache flushing routines. - * - * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co - * David Mosberger-Tang <davidm@hpl.hp.com> - * - * 05/28/05 Zoltan Menyhart Dynamic stride size - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - - /* - * flush_icache_range(start,end) - * - * Make i-cache(s) coherent with d-caches. - * - * Must deal with range from start to end-1 but nothing else (need to - * be careful not to touch addresses that may be unmapped). - * - * Note: "in0" and "in1" are preserved for debugging purposes. - */ - .section .kprobes.text,"ax" -GLOBAL_ENTRY(flush_icache_range) - - .prologue - alloc r2=ar.pfs,2,0,0,0 - movl r3=ia64_i_cache_stride_shift - mov r21=1 - ;; - ld8 r20=[r3] // r20: stride shift - sub r22=in1,r0,1 // last byte address - ;; - shr.u r23=in0,r20 // start / (stride size) - shr.u r22=r22,r20 // (last byte address) / (stride size) - shl r21=r21,r20 // r21: stride size of the i-cache(s) - ;; - sub r8=r22,r23 // number of strides - 1 - shl r24=r23,r20 // r24: addresses for "fc.i" = - // "start" rounded down to stride boundary - .save ar.lc,r3 - mov r3=ar.lc // save ar.lc - ;; - - .body - mov ar.lc=r8 - ;; - /* - * 32 byte aligned loop, even number of (actually 2) bundles - */ -.Loop: fc.i r24 // issuable on M0 only - add r24=r21,r24 // we flush "stride size" bytes per iteration - nop.i 0 - br.cloop.sptk.few .Loop - ;; - sync.i - ;; - srlz.i - ;; - mov ar.lc=r3 // restore ar.lc - br.ret.sptk.many rp -END(flush_icache_range) -EXPORT_SYMBOL_GPL(flush_icache_range) - - /* - * clflush_cache_range(start,size) - * - * Flush cache lines from start to start+size-1. - * - * Must deal with range from start to start+size-1 but nothing else - * (need to be careful not to touch addresses that may be - * unmapped). - * - * Note: "in0" and "in1" are preserved for debugging purposes. - */ - .section .kprobes.text,"ax" -GLOBAL_ENTRY(clflush_cache_range) - - .prologue - alloc r2=ar.pfs,2,0,0,0 - movl r3=ia64_cache_stride_shift - mov r21=1 - add r22=in1,in0 - ;; - ld8 r20=[r3] // r20: stride shift - sub r22=r22,r0,1 // last byte address - ;; - shr.u r23=in0,r20 // start / (stride size) - shr.u r22=r22,r20 // (last byte address) / (stride size) - shl r21=r21,r20 // r21: stride size of the i-cache(s) - ;; - sub r8=r22,r23 // number of strides - 1 - shl r24=r23,r20 // r24: addresses for "fc" = - // "start" rounded down to stride - // boundary - .save ar.lc,r3 - mov r3=ar.lc // save ar.lc - ;; - - .body - mov ar.lc=r8 - ;; - /* - * 32 byte aligned loop, even number of (actually 2) bundles - */ -.Loop_fc: - fc r24 // issuable on M0 only - add r24=r21,r24 // we flush "stride size" bytes per iteration - nop.i 0 - br.cloop.sptk.few .Loop_fc - ;; - sync.i - ;; - srlz.i - ;; - mov ar.lc=r3 // restore ar.lc - br.ret.sptk.many rp -END(clflush_cache_range) diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S deleted file mode 100644 index 83586fbc51ff..000000000000 --- a/arch/ia64/lib/idiv32.S +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2000 Hewlett-Packard Co - * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> - * - * 32-bit integer division. - * - * This code is based on the application note entitled "Divide, Square Root - * and Remainder Algorithms for the IA-64 Architecture". This document - * is available as Intel document number 248725-002 or via the web at - * http://developer.intel.com/software/opensource/numerics/ - * - * For more details on the theory behind these algorithms, see "IA-64 - * and Elementary Functions" by Peter Markstein; HP Professional Books - * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions) - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -#ifdef MODULO -# define OP mod -#else -# define OP div -#endif - -#ifdef UNSIGNED -# define SGN u -# define EXTEND zxt4 -# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b -# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b -#else -# define SGN -# define EXTEND sxt4 -# define INT_TO_FP(a,b) fcvt.xf a=b -# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b -#endif - -#define PASTE1(a,b) a##b -#define PASTE(a,b) PASTE1(a,b) -#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3)) - -GLOBAL_ENTRY(NAME) - .regstk 2,0,0,0 - // Transfer inputs to FP registers. - mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias) - EXTEND in0 = in0 // in0 = a - EXTEND in1 = in1 // in1 = b - ;; - setf.sig f8 = in0 - setf.sig f9 = in1 -#ifdef MODULO - sub in1 = r0, in1 // in1 = -b -#endif - ;; - // Convert the inputs to FP, to avoid FP software-assist faults. - INT_TO_FP(f8, f8) - INT_TO_FP(f9, f9) - ;; - setf.exp f7 = r2 // f7 = 2^-34 - frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b) - ;; -(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0 -(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1 - ;; -#ifdef MODULO - setf.sig f9 = in1 // f9 = -b -#endif -(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0 -(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34 - ;; -#ifdef MODULO - setf.sig f7 = in0 -#endif -(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1 - ;; - FP_TO_INT(f6, f6) // q = trunc(q2) - ;; -#ifdef MODULO - xma.l f6 = f6, f9, f7 // r = q*(-b) + a - ;; -#endif - getf.sig r8 = f6 // transfer result to result register - br.ret.sptk.many rp -END(NAME) -EXPORT_SYMBOL(NAME) diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S deleted file mode 100644 index 5c9113691f72..000000000000 --- a/arch/ia64/lib/idiv64.S +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1999-2000 Hewlett-Packard Co - * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com> - * - * 64-bit integer division. - * - * This code is based on the application note entitled "Divide, Square Root - * and Remainder Algorithms for the IA-64 Architecture". This document - * is available as Intel document number 248725-002 or via the web at - * http://developer.intel.com/software/opensource/numerics/ - * - * For more details on the theory behind these algorithms, see "IA-64 - * and Elementary Functions" by Peter Markstein; HP Professional Books - * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions) - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -#ifdef MODULO -# define OP mod -#else -# define OP div -#endif - -#ifdef UNSIGNED -# define SGN u -# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b -# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b -#else -# define SGN -# define INT_TO_FP(a,b) fcvt.xf a=b -# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b -#endif - -#define PASTE1(a,b) a##b -#define PASTE(a,b) PASTE1(a,b) -#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3)) - -GLOBAL_ENTRY(NAME) - .regstk 2,0,0,0 - // Transfer inputs to FP registers. - setf.sig f8 = in0 - setf.sig f9 = in1 - ;; - // Convert the inputs to FP, to avoid FP software-assist faults. - INT_TO_FP(f8, f8) - INT_TO_FP(f9, f9) - ;; - frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b) - ;; -(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0 -(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1 - ;; -(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0 -(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0 - ;; -#ifdef MODULO - sub in1 = r0, in1 // in1 = -b -#endif -(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1 -(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0 - ;; -(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1 -(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a - ;; -#ifdef MODULO - setf.sig f8 = in0 // f8 = a - setf.sig f9 = in1 // f9 = -b -#endif -(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2 - ;; - FP_TO_INT(f11, f11) // q = trunc(q3) - ;; -#ifdef MODULO - xma.l f11 = f11, f9, f8 // r = q*(-b) + a - ;; -#endif - getf.sig r8 = f11 // transfer result to result register - br.ret.sptk.many rp -END(NAME) -EXPORT_SYMBOL(NAME) diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c deleted file mode 100644 index c3e02462ed16..000000000000 --- a/arch/ia64/lib/io.c +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/types.h> - -#include <asm/io.h> - -/* - * Copy data from IO memory space to "real" memory space. - * This needs to be optimized. - */ -void memcpy_fromio(void *to, const volatile void __iomem *from, long count) -{ - char *dst = to; - - while (count) { - count--; - *dst++ = readb(from++); - } -} -EXPORT_SYMBOL(memcpy_fromio); - -/* - * Copy data from "real" memory space to IO memory space. - * This needs to be optimized. - */ -void memcpy_toio(volatile void __iomem *to, const void *from, long count) -{ - const char *src = from; - - while (count) { - count--; - writeb(*src++, to++); - } -} -EXPORT_SYMBOL(memcpy_toio); - -/* - * "memset" on IO memory space. - * This needs to be optimized. - */ -void memset_io(volatile void __iomem *dst, int c, long count) -{ - unsigned char ch = (char)(c & 0xff); - - while (count) { - count--; - writeb(ch, dst); - dst++; - } -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S deleted file mode 100644 index fcc0b812ce2e..000000000000 --- a/arch/ia64/lib/ip_fast_csum.S +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Optmized version of the ip_fast_csum() function - * Used for calculating IP header checksum - * - * Return: 16bit checksum, complemented - * - * Inputs: - * in0: address of buffer to checksum (char *) - * in1: length of the buffer (int) - * - * Copyright (C) 2002, 2006 Intel Corp. - * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com> - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -/* - * Since we know that most likely this function is called with buf aligned - * on 4-byte boundary and 20 bytes in length, we can execution rather quickly - * versus calling generic version of do_csum, which has lots of overhead in - * handling various alignments and sizes. However, due to lack of constrains - * put on the function input argument, cases with alignment not on 4-byte or - * size not equal to 20 bytes will be handled by the generic do_csum function. - */ - -#define in0 r32 -#define in1 r33 -#define in2 r34 -#define in3 r35 -#define in4 r36 -#define ret0 r8 - -GLOBAL_ENTRY(ip_fast_csum) - .prologue - .body - cmp.ne p6,p7=5,in1 // size other than 20 byte? - and r14=3,in0 // is it aligned on 4-byte? - add r15=4,in0 // second source pointer - ;; - cmp.ne.or.andcm p6,p7=r14,r0 - ;; -(p7) ld4 r20=[in0],8 -(p7) ld4 r21=[r15],8 -(p6) br.spnt .generic - ;; - ld4 r22=[in0],8 - ld4 r23=[r15],8 - ;; - ld4 r24=[in0] - add r20=r20,r21 - add r22=r22,r23 - ;; - add r20=r20,r22 - ;; - add r20=r20,r24 - ;; - shr.u ret0=r20,16 // now need to add the carry - zxt2 r20=r20 - ;; - add r20=ret0,r20 - ;; - shr.u ret0=r20,16 // add carry again - zxt2 r20=r20 - ;; - add r20=ret0,r20 - ;; - shr.u ret0=r20,16 - zxt2 r20=r20 - ;; - add r20=ret0,r20 - mov r9=0xffff - ;; - andcm ret0=r9,r20 - .restore sp // reset frame state - br.ret.sptk.many b0 - ;; - -.generic: - .prologue - .save ar.pfs, r35 - alloc r35=ar.pfs,2,2,2,0 - .save rp, r34 - mov r34=b0 - .body - dep.z out1=in1,2,30 - mov out0=in0 - ;; - br.call.sptk.many b0=do_csum - ;; - andcm ret0=-1,ret0 - mov ar.pfs=r35 - mov b0=r34 - br.ret.sptk.many b0 -END(ip_fast_csum) -EXPORT_SYMBOL(ip_fast_csum) - -GLOBAL_ENTRY(csum_ipv6_magic) - ld4 r20=[in0],4 - ld4 r21=[in1],4 - zxt4 in2=in2 - ;; - ld4 r22=[in0],4 - ld4 r23=[in1],4 - dep r15=in3,in2,32,16 - ;; - ld4 r24=[in0],4 - ld4 r25=[in1],4 - mux1 r15=r15,@rev - add r16=r20,r21 - add r17=r22,r23 - zxt4 in4=in4 - ;; - ld4 r26=[in0],4 - ld4 r27=[in1],4 - shr.u r15=r15,16 - add r18=r24,r25 - add r8=r16,r17 - ;; - add r19=r26,r27 - add r8=r8,r18 - ;; - add r8=r8,r19 - add r15=r15,in4 - ;; - add r8=r8,r15 - ;; - shr.u r10=r8,32 // now fold sum into short - zxt4 r11=r8 - ;; - add r8=r10,r11 - ;; - shr.u r10=r8,16 // yeah, keep it rolling - zxt2 r11=r8 - ;; - add r8=r10,r11 - ;; - shr.u r10=r8,16 // three times lucky - zxt2 r11=r8 - ;; - add r8=r10,r11 - mov r9=0xffff - ;; - andcm r8=r9,r8 - br.ret.sptk.many b0 -END(csum_ipv6_magic) -EXPORT_SYMBOL(csum_ipv6_magic) diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S deleted file mode 100644 index 35c9069a8345..000000000000 --- a/arch/ia64/lib/memcpy.S +++ /dev/null @@ -1,304 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * Optimized version of the standard memcpy() function - * - * Inputs: - * in0: destination address - * in1: source address - * in2: number of bytes to copy - * Output: - * no return value - * - * Copyright (C) 2000-2001 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * David Mosberger-Tang <davidm@hpl.hp.com> - */ -#include <linux/export.h> -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(memcpy) - -# define MEM_LAT 21 /* latency to memory */ - -# define dst r2 -# define src r3 -# define retval r8 -# define saved_pfs r9 -# define saved_lc r10 -# define saved_pr r11 -# define cnt r16 -# define src2 r17 -# define t0 r18 -# define t1 r19 -# define t2 r20 -# define t3 r21 -# define t4 r22 -# define src_end r23 - -# define N (MEM_LAT + 4) -# define Nrot ((N + 7) & ~7) - - /* - * First, check if everything (src, dst, len) is a multiple of eight. If - * so, we handle everything with no taken branches (other than the loop - * itself) and a small icache footprint. Otherwise, we jump off to - * the more general copy routine handling arbitrary - * sizes/alignment etc. - */ - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot - .save ar.lc, saved_lc - mov saved_lc=ar.lc - or t0=in0,in1 - ;; - - or t0=t0,in2 - .save pr, saved_pr - mov saved_pr=pr - - .body - - cmp.eq p6,p0=in2,r0 // zero length? - mov retval=in0 // return dst -(p6) br.ret.spnt.many rp // zero length, return immediately - ;; - - mov dst=in0 // copy because of rotation - shr.u cnt=in2,3 // number of 8-byte words to copy - mov pr.rot=1<<16 - ;; - - adds cnt=-1,cnt // br.ctop is repeat/until - cmp.gtu p7,p0=16,in2 // copying less than 16 bytes? - mov ar.ec=N - ;; - - and t0=0x7,t0 - mov ar.lc=cnt - ;; - cmp.ne p6,p0=t0,r0 - - mov src=in1 // copy because of rotation -(p7) br.cond.spnt.few .memcpy_short -(p6) br.cond.spnt.few .memcpy_long - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; - .rotr val[N] - .rotp p[N] - .align 32 -1: { .mib -(p[0]) ld8 val[0]=[src],8 - nop.i 0 - brp.loop.imp 1b, 2f -} -2: { .mfb -(p[N-1])st8 [dst]=val[N-1],8 - nop.f 0 - br.ctop.dptk.few 1b -} - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,-1 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - /* - * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time - * copy loop. This performs relatively poorly on Itanium, but it doesn't - * get used very often (gcc inlines small copies) and due to atomicity - * issues, we want to avoid read-modify-write of entire words. - */ - .align 32 -.memcpy_short: - adds cnt=-1,in2 // br.ctop is repeat/until - mov ar.ec=MEM_LAT - brp.loop.imp 1f, 2f - ;; - mov ar.lc=cnt - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; - nop.m 0 - ;; - /* - * It is faster to put a stop bit in the loop here because it makes - * the pipeline shorter (and latency is what matters on short copies). - */ - .align 32 -1: { .mib -(p[0]) ld1 val[0]=[src],1 - nop.i 0 - brp.loop.imp 1b, 2f -} ;; -2: { .mfb -(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 - nop.f 0 - br.ctop.dptk.few 1b -} ;; - mov ar.lc=saved_lc - mov pr=saved_pr,-1 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - /* - * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't - * an overriding concern here, but throughput is. We first do - * sub-word copying until the destination is aligned, then we check - * if the source is also aligned. If so, we do a simple load/store-loop - * until there are less than 8 bytes left over and then we do the tail, - * by storing the last few bytes using sub-word copying. If the source - * is not aligned, we branch off to the non-congruent loop. - * - * stage: op: - * 0 ld - * : - * MEM_LAT+3 shrp - * MEM_LAT+4 st - * - * On Itanium, the pipeline itself runs without stalls. However, br.ctop - * seems to introduce an unavoidable bubble in the pipeline so the overall - * latency is 2 cycles/iteration. This gives us a _copy_ throughput - * of 4 byte/cycle. Still not bad. - */ -# undef N -# undef Nrot -# define N (MEM_LAT + 5) /* number of stages */ -# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */ - -#define LOG_LOOP_SIZE 6 - -.memcpy_long: - alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame - and t0=-8,src // t0 = src & ~7 - and t2=7,src // t2 = src & 7 - ;; - ld8 t0=[t0] // t0 = 1st source word - adds src2=7,src // src2 = (src + 7) - sub t4=r0,dst // t4 = -dst - ;; - and src2=-8,src2 // src2 = (src + 7) & ~7 - shl t2=t2,3 // t2 = 8*(src & 7) - shl t4=t4,3 // t4 = 8*(dst & 7) - ;; - ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise - sub t3=64,t2 // t3 = 64-8*(src & 7) - shr.u t0=t0,t2 - ;; - add src_end=src,in2 - shl t1=t1,t3 - mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7) - ;; - or t0=t0,t1 - mov cnt=r0 - adds src_end=-1,src_end - ;; -(p3) st1 [dst]=t0,1 -(p3) shr.u t0=t0,8 -(p3) adds cnt=1,cnt - ;; -(p4) st2 [dst]=t0,2 -(p4) shr.u t0=t0,16 -(p4) adds cnt=2,cnt - ;; -(p5) st4 [dst]=t0,4 -(p5) adds cnt=4,cnt - and src_end=-8,src_end // src_end = last word of source buffer - ;; - - // At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy: - -1:{ add src=cnt,src // make src point to remainder of source buffer - sub cnt=in2,cnt // cnt = number of bytes left to copy - mov t4=ip - } ;; - and src2=-8,src // align source pointer - adds t4=.memcpy_loops-1b,t4 - mov ar.ec=N - - and t0=7,src // t0 = src & 7 - shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy - shl cnt=cnt,3 // move bits 0-2 to 3-5 - ;; - - .rotr val[N+1], w[2] - .rotp p[N] - - cmp.ne p6,p0=t0,r0 // is src aligned, too? - shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7) - adds t2=-1,t2 // br.ctop is repeat/until - ;; - add t4=t0,t4 - mov pr=cnt,0x38 // set (p5,p4,p3) to # of bytes last-word bytes to copy - mov ar.lc=t2 - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; -(p6) ld8 val[1]=[src2],8 // prime the pump... - mov b6=t4 - br.sptk.few b6 - ;; - -.memcpy_tail: - // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is - // less than 8) and t0 contains the last few bytes of the src buffer: -(p5) st4 [dst]=t0,4 -(p5) shr.u t0=t0,32 - mov ar.lc=saved_lc - ;; -(p4) st2 [dst]=t0,2 -(p4) shr.u t0=t0,16 - mov ar.pfs=saved_pfs - ;; -(p3) st1 [dst]=t0 - mov pr=saved_pr,-1 - br.ret.sptk.many rp - -/////////////////////////////////////////////////////// - .align 64 - -#define COPY(shift,index) \ - 1: { .mib \ - (p[0]) ld8 val[0]=[src2],8; \ - (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \ - brp.loop.imp 1b, 2f \ - }; \ - 2: { .mfb \ - (p[MEM_LAT+4]) st8 [dst]=w[1],8; \ - nop.f 0; \ - br.ctop.dptk.few 1b; \ - }; \ - ;; \ - ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \ - ;; \ - shrp t0=val[N-1],val[N-index],shift; \ - br .memcpy_tail -.memcpy_loops: - COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ - COPY(8, 0) - COPY(16, 0) - COPY(24, 0) - COPY(32, 0) - COPY(40, 0) - COPY(48, 0) - COPY(56, 0) - -END(memcpy) -EXPORT_SYMBOL(memcpy) diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S deleted file mode 100644 index c0d4362217ae..000000000000 --- a/arch/ia64/lib/memcpy_mck.S +++ /dev/null @@ -1,659 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Itanium 2-optimized version of memcpy and copy_user function - * - * Inputs: - * in0: destination address - * in1: source address - * in2: number of bytes to copy - * Output: - * for memcpy: return dest - * for copy_user: return 0 if success, - * or number of byte NOT copied if error occurred. - * - * Copyright (C) 2002 Intel Corp. - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> - */ -#include <linux/export.h> -#include <asm/asmmacro.h> -#include <asm/page.h> - -#define EK(y...) EX(y) - -/* McKinley specific optimization */ - -#define retval r8 -#define saved_pfs r31 -#define saved_lc r10 -#define saved_pr r11 -#define saved_in0 r14 -#define saved_in1 r15 -#define saved_in2 r16 - -#define src0 r2 -#define src1 r3 -#define dst0 r17 -#define dst1 r18 -#define cnt r9 - -/* r19-r30 are temp for each code section */ -#define PREFETCH_DIST 8 -#define src_pre_mem r19 -#define dst_pre_mem r20 -#define src_pre_l2 r21 -#define dst_pre_l2 r22 -#define t1 r23 -#define t2 r24 -#define t3 r25 -#define t4 r26 -#define t5 t1 // alias! -#define t6 t2 // alias! -#define t7 t3 // alias! -#define n8 r27 -#define t9 t5 // alias! -#define t10 t4 // alias! -#define t11 t7 // alias! -#define t12 t6 // alias! -#define t14 t10 // alias! -#define t13 r28 -#define t15 r29 -#define tmp r30 - -/* defines for long_copy block */ -#define A 0 -#define B (PREFETCH_DIST) -#define C (B + PREFETCH_DIST) -#define D (C + 1) -#define N (D + 1) -#define Nrot ((N + 7) & ~7) - -/* alias */ -#define in0 r32 -#define in1 r33 -#define in2 r34 - -GLOBAL_ENTRY(memcpy) - and r28=0x7,in0 - and r29=0x7,in1 - mov f6=f0 - mov retval=in0 - br.cond.sptk .common_code - ;; -END(memcpy) -EXPORT_SYMBOL(memcpy) -GLOBAL_ENTRY(__copy_user) - .prologue -// check dest alignment - and r28=0x7,in0 - and r29=0x7,in1 - mov f6=f1 - mov saved_in0=in0 // save dest pointer - mov saved_in1=in1 // save src pointer - mov retval=r0 // initialize return value - ;; -.common_code: - cmp.gt p15,p0=8,in2 // check for small size - cmp.ne p13,p0=0,r28 // check dest alignment - cmp.ne p14,p0=0,r29 // check src alignment - add src0=0,in1 - sub r30=8,r28 // for .align_dest - mov saved_in2=in2 // save len - ;; - add dst0=0,in0 - add dst1=1,in0 // dest odd index - cmp.le p6,p0 = 1,r30 // for .align_dest -(p15) br.cond.dpnt .memcpy_short -(p13) br.cond.dpnt .align_dest -(p14) br.cond.dpnt .unaligned_src - ;; - -// both dest and src are aligned on 8-byte boundary -.aligned_src: - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot - .save pr, saved_pr - mov saved_pr=pr - - shr.u cnt=in2,7 // this much cache line - ;; - cmp.lt p6,p0=2*PREFETCH_DIST,cnt - cmp.lt p7,p8=1,cnt - .save ar.lc, saved_lc - mov saved_lc=ar.lc - .body - add cnt=-1,cnt - add src_pre_mem=0,in1 // prefetch src pointer - add dst_pre_mem=0,in0 // prefetch dest pointer - ;; -(p7) mov ar.lc=cnt // prefetch count -(p8) mov ar.lc=r0 -(p6) br.cond.dpnt .long_copy - ;; - -.prefetch: - lfetch.fault [src_pre_mem], 128 - lfetch.fault.excl [dst_pre_mem], 128 - br.cloop.dptk.few .prefetch - ;; - -.medium_copy: - and tmp=31,in2 // copy length after iteration - shr.u r29=in2,5 // number of 32-byte iteration - add dst1=8,dst0 // 2nd dest pointer - ;; - add cnt=-1,r29 // ctop iteration adjustment - cmp.eq p10,p0=r29,r0 // do we really need to loop? - add src1=8,src0 // 2nd src pointer - cmp.le p6,p0=8,tmp - ;; - cmp.le p7,p0=16,tmp - mov ar.lc=cnt // loop setup - cmp.eq p16,p17 = r0,r0 - mov ar.ec=2 -(p10) br.dpnt.few .aligned_src_tail - ;; - TEXT_ALIGN(32) -1: -EX(.ex_handler, (p16) ld8 r34=[src0],16) -EK(.ex_handler, (p16) ld8 r38=[src1],16) -EX(.ex_handler, (p17) st8 [dst0]=r33,16) -EK(.ex_handler, (p17) st8 [dst1]=r37,16) - ;; -EX(.ex_handler, (p16) ld8 r32=[src0],16) -EK(.ex_handler, (p16) ld8 r36=[src1],16) -EX(.ex_handler, (p16) st8 [dst0]=r34,16) -EK(.ex_handler, (p16) st8 [dst1]=r38,16) - br.ctop.dptk.few 1b - ;; - -.aligned_src_tail: -EX(.ex_handler, (p6) ld8 t1=[src0]) - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs -EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8) - cmp.le p8,p0=24,tmp - and r21=-8,tmp - ;; -EX(.ex_hndlr_s, (p8) ld8 t3=[src1]) -EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1 - and in2=7,tmp // remaining length -EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2 - add src0=src0,r21 // setting up src pointer - add dst0=dst0,r21 // setting up dest pointer - ;; -EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3 - mov pr=saved_pr,-1 - br.dptk.many .memcpy_short - ;; - -/* code taken from copy_page_mck */ -.long_copy: - .rotr v[2*PREFETCH_DIST] - .rotp p[N] - - mov src_pre_mem = src0 - mov pr.rot = 0x10000 - mov ar.ec = 1 // special unrolled loop - - mov dst_pre_mem = dst0 - - add src_pre_l2 = 8*8, src0 - add dst_pre_l2 = 8*8, dst0 - ;; - add src0 = 8, src_pre_mem // first t1 src - mov ar.lc = 2*PREFETCH_DIST - 1 - shr.u cnt=in2,7 // number of lines - add src1 = 3*8, src_pre_mem // first t3 src - add dst0 = 8, dst_pre_mem // first t1 dst - add dst1 = 3*8, dst_pre_mem // first t3 dst - ;; - and tmp=127,in2 // remaining bytes after this block - add cnt = -(2*PREFETCH_DIST) - 1, cnt - // same as .line_copy loop, but with all predicated-off instructions removed: -.prefetch_loop: -EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 -EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 - br.ctop.sptk .prefetch_loop - ;; - cmp.eq p16, p0 = r0, r0 // reset p16 to 1 - mov ar.lc = cnt - mov ar.ec = N // # of stages in pipeline - ;; -.line_copy: -EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0 -EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1 -EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory -EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2 - ;; -EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory -EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2 -EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2 -EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3 - ;; -EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8) -EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8) -EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8) -EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8) - ;; -EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8) -EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8) -EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8) -EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8) - ;; -EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8) -EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8) -EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8) -EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8) - ;; -EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8) -EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8) -EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8) -EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8) - ;; -EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8) -EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8) -EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8) -EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8) - ;; -EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8) -EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8) -EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8) -EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8) - br.ctop.sptk .line_copy - ;; - - add dst0=-8,dst0 - add src0=-8,src0 - mov in2=tmp - .restore sp - br.sptk.many .medium_copy - ;; - -#define BLOCK_SIZE 128*32 -#define blocksize r23 -#define curlen r24 - -// dest is on 8-byte boundary, src is not. We need to do -// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle. -.unaligned_src: - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,5,0,8 - .save ar.lc, saved_lc - mov saved_lc=ar.lc - .save pr, saved_pr - mov saved_pr=pr - .body -.4k_block: - mov saved_in0=dst0 // need to save all input arguments - mov saved_in2=in2 - mov blocksize=BLOCK_SIZE - ;; - cmp.lt p6,p7=blocksize,in2 - mov saved_in1=src0 - ;; -(p6) mov in2=blocksize - ;; - shr.u r21=in2,7 // this much cache line - shr.u r22=in2,4 // number of 16-byte iteration - and curlen=15,in2 // copy length after iteration - and r30=7,src0 // source alignment - ;; - cmp.lt p7,p8=1,r21 - add cnt=-1,r21 - ;; - - add src_pre_mem=0,src0 // prefetch src pointer - add dst_pre_mem=0,dst0 // prefetch dest pointer - and src0=-8,src0 // 1st src pointer -(p7) mov ar.lc = cnt -(p8) mov ar.lc = r0 - ;; - TEXT_ALIGN(32) -1: lfetch.fault [src_pre_mem], 128 - lfetch.fault.excl [dst_pre_mem], 128 - br.cloop.dptk.few 1b - ;; - - shladd dst1=r22,3,dst0 // 2nd dest pointer - shladd src1=r22,3,src0 // 2nd src pointer - cmp.eq p8,p9=r22,r0 // do we really need to loop? - cmp.le p6,p7=8,curlen; // have at least 8 byte remaining? - add cnt=-1,r22 // ctop iteration adjustment - ;; -EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer -EK(.ex_handler, (p9) ld8 r37=[src1],8) -(p8) br.dpnt.few .noloop - ;; - -// The jump address is calculated based on src alignment. The COPYU -// macro below need to confine its size to power of two, so an entry -// can be caulated using shl instead of an expensive multiply. The -// size is then hard coded by the following #define to match the -// actual size. This make it somewhat tedious when COPYU macro gets -// changed and this need to be adjusted to match. -#define LOOP_SIZE 6 -1: - mov r29=ip // jmp_table thread - mov ar.lc=cnt - ;; - add r29=.jump_table - 1b - (.jmp1-.jump_table), r29 - shl r28=r30, LOOP_SIZE // jmp_table thread - mov ar.ec=2 // loop setup - ;; - add r29=r29,r28 // jmp_table thread - cmp.eq p16,p17=r0,r0 - ;; - mov b6=r29 // jmp_table thread - ;; - br.cond.sptk.few b6 - -// for 8-15 byte case -// We will skip the loop, but need to replicate the side effect -// that the loop produces. -.noloop: -EX(.ex_handler, (p6) ld8 r37=[src1],8) - add src0=8,src0 -(p6) shl r25=r30,3 - ;; -EX(.ex_handler, (p6) ld8 r27=[src1]) -(p6) shr.u r28=r37,r25 -(p6) sub r26=64,r25 - ;; -(p6) shl r27=r27,r26 - ;; -(p6) or r21=r28,r27 - -.unaligned_src_tail: -/* check if we have more than blocksize to copy, if so go back */ - cmp.gt p8,p0=saved_in2,blocksize - ;; -(p8) add dst0=saved_in0,blocksize -(p8) add src0=saved_in1,blocksize -(p8) sub in2=saved_in2,blocksize -(p8) br.dpnt .4k_block - ;; - -/* we have up to 15 byte to copy in the tail. - * part of work is already done in the jump table code - * we are at the following state. - * src side: - * - * xxxxxx xx <----- r21 has xxxxxxxx already - * -------- -------- -------- - * 0 8 16 - * ^ - * | - * src1 - * - * dst - * -------- -------- -------- - * ^ - * | - * dst1 - */ -EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy -(p6) add curlen=-8,curlen // update length - mov ar.pfs=saved_pfs - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,-1 - mov in2=curlen // remaining length - mov dst0=dst1 // dest pointer - add src0=src1,r30 // forward by src alignment - ;; - -// 7 byte or smaller. -.memcpy_short: - cmp.le p8,p9 = 1,in2 - cmp.le p10,p11 = 2,in2 - cmp.le p12,p13 = 3,in2 - cmp.le p14,p15 = 4,in2 - add src1=1,src0 // second src pointer - add dst1=1,dst0 // second dest pointer - ;; - -EX(.ex_handler_short, (p8) ld1 t1=[src0],2) -EK(.ex_handler_short, (p10) ld1 t2=[src1],2) -(p9) br.ret.dpnt rp // 0 byte copy - ;; - -EX(.ex_handler_short, (p8) st1 [dst0]=t1,2) -EK(.ex_handler_short, (p10) st1 [dst1]=t2,2) -(p11) br.ret.dpnt rp // 1 byte copy - -EX(.ex_handler_short, (p12) ld1 t3=[src0],2) -EK(.ex_handler_short, (p14) ld1 t4=[src1],2) -(p13) br.ret.dpnt rp // 2 byte copy - ;; - - cmp.le p6,p7 = 5,in2 - cmp.le p8,p9 = 6,in2 - cmp.le p10,p11 = 7,in2 - -EX(.ex_handler_short, (p12) st1 [dst0]=t3,2) -EK(.ex_handler_short, (p14) st1 [dst1]=t4,2) -(p15) br.ret.dpnt rp // 3 byte copy - ;; - -EX(.ex_handler_short, (p6) ld1 t5=[src0],2) -EK(.ex_handler_short, (p8) ld1 t6=[src1],2) -(p7) br.ret.dpnt rp // 4 byte copy - ;; - -EX(.ex_handler_short, (p6) st1 [dst0]=t5,2) -EK(.ex_handler_short, (p8) st1 [dst1]=t6,2) -(p9) br.ret.dptk rp // 5 byte copy - -EX(.ex_handler_short, (p10) ld1 t7=[src0],2) -(p11) br.ret.dptk rp // 6 byte copy - ;; - -EX(.ex_handler_short, (p10) st1 [dst0]=t7,2) - br.ret.dptk rp // done all cases - - -/* Align dest to nearest 8-byte boundary. We know we have at - * least 7 bytes to copy, enough to crawl to 8-byte boundary. - * Actual number of byte to crawl depend on the dest alignment. - * 7 byte or less is taken care at .memcpy_short - - * src0 - source even index - * src1 - source odd index - * dst0 - dest even index - * dst1 - dest odd index - * r30 - distance to 8-byte boundary - */ - -.align_dest: - add src1=1,in1 // source odd index - cmp.le p7,p0 = 2,r30 // for .align_dest - cmp.le p8,p0 = 3,r30 // for .align_dest -EX(.ex_handler_short, (p6) ld1 t1=[src0],2) - cmp.le p9,p0 = 4,r30 // for .align_dest - cmp.le p10,p0 = 5,r30 - ;; -EX(.ex_handler_short, (p7) ld1 t2=[src1],2) -EK(.ex_handler_short, (p8) ld1 t3=[src0],2) - cmp.le p11,p0 = 6,r30 -EX(.ex_handler_short, (p6) st1 [dst0] = t1,2) - cmp.le p12,p0 = 7,r30 - ;; -EX(.ex_handler_short, (p9) ld1 t4=[src1],2) -EK(.ex_handler_short, (p10) ld1 t5=[src0],2) -EX(.ex_handler_short, (p7) st1 [dst1] = t2,2) -EK(.ex_handler_short, (p8) st1 [dst0] = t3,2) - ;; -EX(.ex_handler_short, (p11) ld1 t6=[src1],2) -EK(.ex_handler_short, (p12) ld1 t7=[src0],2) - cmp.eq p6,p7=r28,r29 -EX(.ex_handler_short, (p9) st1 [dst1] = t4,2) -EK(.ex_handler_short, (p10) st1 [dst0] = t5,2) - sub in2=in2,r30 - ;; -EX(.ex_handler_short, (p11) st1 [dst1] = t6,2) -EK(.ex_handler_short, (p12) st1 [dst0] = t7) - add dst0=in0,r30 // setup arguments - add src0=in1,r30 -(p6) br.cond.dptk .aligned_src -(p7) br.cond.dpnt .unaligned_src - ;; - -/* main loop body in jump table format */ -#define COPYU(shift) \ -1: \ -EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \ -EK(.ex_handler, (p16) ld8 r36=[src1],8); \ - (p17) shrp r35=r33,r34,shift;; /* 1 */ \ -EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \ - nop.m 0; \ - (p16) shrp r38=r36,r37,shift; \ -EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \ -EK(.ex_handler, (p17) st8 [dst1]=r39,8); \ - br.ctop.dptk.few 1b;; \ - (p7) add src1=-8,src1; /* back out for <8 byte case */ \ - shrp r21=r22,r38,shift; /* speculative work */ \ - br.sptk.few .unaligned_src_tail /* branch out of jump table */ \ - ;; - TEXT_ALIGN(32) -.jump_table: - COPYU(8) // unaligned cases -.jmp1: - COPYU(16) - COPYU(24) - COPYU(32) - COPYU(40) - COPYU(48) - COPYU(56) - -#undef A -#undef B -#undef C -#undef D - -/* - * Due to lack of local tag support in gcc 2.x assembler, it is not clear which - * instruction failed in the bundle. The exception algorithm is that we - * first figure out the faulting address, then detect if there is any - * progress made on the copy, if so, redo the copy from last known copied - * location up to the faulting address (exclusive). In the copy_from_user - * case, remaining byte in kernel buffer will be zeroed. - * - * Take copy_from_user as an example, in the code there are multiple loads - * in a bundle and those multiple loads could span over two pages, the - * faulting address is calculated as page_round_down(max(src0, src1)). - * This is based on knowledge that if we can access one byte in a page, we - * can access any byte in that page. - * - * predicate used in the exception handler: - * p6-p7: direction - * p10-p11: src faulting addr calculation - * p12-p13: dst faulting addr calculation - */ - -#define A r19 -#define B r20 -#define C r21 -#define D r22 -#define F r28 - -#define saved_retval loc0 -#define saved_rtlink loc1 -#define saved_pfs_stack loc2 - -.ex_hndlr_s: - add src0=8,src0 - br.sptk .ex_handler - ;; -.ex_hndlr_d: - add dst0=8,dst0 - br.sptk .ex_handler - ;; -.ex_hndlr_lcpy_1: - mov src1=src_pre_mem - mov dst1=dst_pre_mem - cmp.gtu p10,p11=src_pre_mem,saved_in1 - cmp.gtu p12,p13=dst_pre_mem,saved_in0 - ;; -(p10) add src0=8,saved_in1 -(p11) mov src0=saved_in1 -(p12) add dst0=8,saved_in0 -(p13) mov dst0=saved_in0 - br.sptk .ex_handler -.ex_handler_lcpy: - // in line_copy block, the preload addresses should always ahead - // of the other two src/dst pointers. Furthermore, src1/dst1 should - // always ahead of src0/dst0. - mov src1=src_pre_mem - mov dst1=dst_pre_mem -.ex_handler: - mov pr=saved_pr,-1 // first restore pr, lc, and pfs - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - ;; -.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs - cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction - cmp.ltu p10,p11=src0,src1 - cmp.ltu p12,p13=dst0,dst1 - fcmp.eq p8,p0=f6,f0 // is it memcpy? - mov tmp = dst0 - ;; -(p11) mov src1 = src0 // pick the larger of the two -(p13) mov dst0 = dst1 // make dst0 the smaller one -(p13) mov dst1 = tmp // and dst1 the larger one - ;; -(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary -(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary - ;; -(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store -(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load - mov retval=saved_in2 -(p8) ld1 tmp=[src1] // force an oops for memcpy call -(p8) st1 [dst1]=r0 // force an oops for memcpy call -(p14) br.ret.sptk.many rp - -/* - * The remaining byte to copy is calculated as: - * - * A = (faulting_addr - orig_src) -> len to faulting ld address - * or - * (faulting_addr - orig_dst) -> len to faulting st address - * B = (cur_dst - orig_dst) -> len copied so far - * C = A - B -> len need to be copied - * D = orig_len - A -> len need to be left along - */ -(p6) sub A = F, saved_in0 -(p7) sub A = F, saved_in1 - clrrrb - ;; - alloc saved_pfs_stack=ar.pfs,3,3,3,0 - cmp.lt p8,p0=A,r0 - sub B = dst0, saved_in0 // how many byte copied so far - ;; -(p8) mov A = 0; // A shouldn't be negative, cap it - ;; - sub C = A, B - sub D = saved_in2, A - ;; - cmp.gt p8,p0=C,r0 // more than 1 byte? - mov r8=0 - mov saved_retval = D - mov saved_rtlink = b0 - - add out0=saved_in0, B - add out1=saved_in1, B - mov out2=C -(p8) br.call.sptk.few b0=__copy_user // recursive call - ;; - - add saved_retval=saved_retval,r8 // above might return non-zero value - ;; - - mov retval=saved_retval - mov ar.pfs=saved_pfs_stack - mov b0=saved_rtlink - br.ret.sptk.many rp - -/* end of McKinley specific optimization */ -END(__copy_user) -EXPORT_SYMBOL(__copy_user) diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S deleted file mode 100644 index 552c5c7e4d06..000000000000 --- a/arch/ia64/lib/memset.S +++ /dev/null @@ -1,365 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Optimized version of the standard memset() function. - - Copyright (c) 2002 Hewlett-Packard Co/CERN - Sverre Jarp <Sverre.Jarp@cern.ch> - - Return: dest - - Inputs: - in0: dest - in1: value - in2: count - - The algorithm is fairly straightforward: set byte by byte until we - we get to a 16B-aligned address, then loop on 128 B chunks using an - early store as prefetching, then loop on 32B chucks, then clear remaining - words, finally clear remaining bytes. - Since a stf.spill f0 can store 16B in one go, we use this instruction - to get peak speed when value = 0. */ - -#include <linux/export.h> -#include <asm/asmmacro.h> -#undef ret - -#define dest in0 -#define value in1 -#define cnt in2 - -#define tmp r31 -#define save_lc r30 -#define ptr0 r29 -#define ptr1 r28 -#define ptr2 r27 -#define ptr3 r26 -#define ptr9 r24 -#define loopcnt r23 -#define linecnt r22 -#define bytecnt r21 - -#define fvalue f6 - -// This routine uses only scratch predicate registers (p6 - p15) -#define p_scr p6 // default register for same-cycle branches -#define p_nz p7 -#define p_zr p8 -#define p_unalgn p9 -#define p_y p11 -#define p_n p12 -#define p_yy p13 -#define p_nn p14 - -#define MIN1 15 -#define MIN1P1HALF 8 -#define LINE_SIZE 128 -#define LSIZE_SH 7 // shift amount -#define PREF_AHEAD 8 - -GLOBAL_ENTRY(memset) -{ .mmi - .prologue - alloc tmp = ar.pfs, 3, 0, 0, 0 - lfetch.nt1 [dest] // - .save ar.lc, save_lc - mov.i save_lc = ar.lc - .body -} { .mmi - mov ret0 = dest // return value - cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero - cmp.eq p_scr, p0 = cnt, r0 -;; } -{ .mmi - and ptr2 = -(MIN1+1), dest // aligned address - and tmp = MIN1, dest // prepare to check for correct alignment - tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) -} { .mib - mov ptr1 = dest - mux1 value = value, @brcst // create 8 identical bytes in word -(p_scr) br.ret.dpnt.many rp // return immediately if count = 0 -;; } -{ .mib - cmp.ne p_unalgn, p0 = tmp, r0 // -} { .mib - sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt - cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? -(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) -;; } -{ .mmi -(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment -(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? -;; } -{ .mib -(p_y) add cnt = -8, cnt // -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? -} { .mib -(p_y) st8 [ptr2] = value,-4 // -(p_n) add ptr2 = 4, ptr2 // -;; } -{ .mib -(p_yy) add cnt = -4, cnt // -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? -} { .mib -(p_yy) st4 [ptr2] = value,-2 // -(p_nn) add ptr2 = 2, ptr2 // -;; } -{ .mmi - mov tmp = LINE_SIZE+1 // for compare -(p_y) add cnt = -2, cnt // -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? -} { .mmi - setf.sig fvalue=value // transfer value to FLP side -(p_y) st2 [ptr2] = value,-1 // -(p_n) add ptr2 = 1, ptr2 // -;; } - -{ .mmi -(p_yy) st1 [ptr2] = value // - cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? -} { .mbb -(p_yy) add cnt = -1, cnt // -(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few -;; } - -{ .mib - nop.m 0 - shr.u linecnt = cnt, LSIZE_SH -(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill -;; } - - TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later -{ .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder -} { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value -;; } -{ .mmi -(p_scr) add loopcnt = -1, linecnt // - add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total range -;; } -{ .mmi - add tmp = -1, linecnt // next loop count - mov.i ar.lc = loopcnt // -;; } -.pref_l1a: -{ .mib - stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart - nop.i 0 - br.cloop.dptk.few .pref_l1a -;; } -{ .mmi - add ptr0 = 16, ptr2 // Two stores in parallel - mov.i ar.lc = tmp // -;; } -.l1ax: - { .mmi - stf8 [ptr2] = fvalue, 8 - stf8 [ptr0] = fvalue, 8 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 24 - stf8 [ptr0] = fvalue, 24 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 8 - stf8 [ptr0] = fvalue, 8 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 24 - stf8 [ptr0] = fvalue, 24 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 8 - stf8 [ptr0] = fvalue, 8 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 24 - stf8 [ptr0] = fvalue, 24 - ;; } - { .mmi - stf8 [ptr2] = fvalue, 8 - stf8 [ptr0] = fvalue, 32 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? - ;; } -{ .mmb - stf8 [ptr2] = fvalue, 24 -(p_scr) stf8 [ptr9] = fvalue, 128 - br.cloop.dptk.few .l1ax -;; } -{ .mbb - cmp.le p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2 - br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3 -;; } - - TEXT_ALIGN(32) -.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later -{ .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder -} { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value -;; } -{ .mmi -(p_scr) add loopcnt = -1, linecnt - add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total range -;; } -{ .mmi - add tmp = -1, linecnt // next loop count - mov.i ar.lc = loopcnt -;; } -.pref_l1b: -{ .mib - stf.spill [ptr9] = f0, 128 // Do stores one cache line apart - nop.i 0 - br.cloop.dptk.few .pref_l1b -;; } -{ .mmi - add ptr0 = 16, ptr2 // Two stores in parallel - mov.i ar.lc = tmp -;; } -.l1bx: - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 32 - ;; } - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 32 - ;; } - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 64 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? - ;; } -{ .mmb - stf.spill [ptr2] = f0, 32 -(p_scr) stf.spill [ptr9] = f0, 128 - br.cloop.dptk.few .l1bx -;; } -{ .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // -;; } - -.fraction_of_line: -{ .mib - add ptr2 = 16, ptr1 - shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 -;; } -{ .mib - cmp.eq p_scr, p0 = loopcnt, r0 - add loopcnt = -1, loopcnt -(p_scr) br.cond.dpnt.many .store_words -;; } -{ .mib - and cnt = 0x1f, cnt // compute the remaining cnt - mov.i ar.lc = loopcnt -;; } - TEXT_ALIGN(32) -.l2: // ------------------------------------ // L2A: store 32B in 2 cycles -{ .mmb - stf8 [ptr1] = fvalue, 8 - stf8 [ptr2] = fvalue, 8 -;; } { .mmb - stf8 [ptr1] = fvalue, 24 - stf8 [ptr2] = fvalue, 24 - br.cloop.dptk.many .l2 -;; } -.store_words: -{ .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch -;; } - -{ .mmi - stf8 [ptr1] = fvalue, 8 // store - cmp.le p_y, p_n = 16, cnt - add cnt = -8, cnt // subtract -;; } -{ .mmi -(p_y) stf8 [ptr1] = fvalue, 8 // store -(p_y) cmp.le.unc p_yy, p_nn = 16, cnt -(p_y) add cnt = -8, cnt // subtract -;; } -{ .mmi // store -(p_yy) stf8 [ptr1] = fvalue, 8 -(p_yy) add cnt = -8, cnt // subtract -;; } - -.move_bytes_from_alignment: -{ .mib - cmp.eq p_scr, p0 = cnt, r0 - tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? -(p_scr) br.cond.dpnt.few .restore_and_exit -;; } -{ .mib -(p_y) st4 [ptr1] = value,4 - tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? -;; } -{ .mib -(p_yy) st2 [ptr1] = value,2 - tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? -;; } - -{ .mib -(p_y) st1 [ptr1] = value -;; } -.restore_and_exit: -{ .mib - nop.m 0 - mov.i ar.lc = save_lc - br.ret.sptk.many rp -;; } - -.move_bytes_unaligned: -{ .mmi - .pred.rel "mutex",p_y, p_n - .pred.rel "mutex",p_yy, p_nn -(p_n) cmp.le p_yy, p_nn = 4, cnt -(p_y) cmp.le p_yy, p_nn = 5, cnt -(p_n) add ptr2 = 2, ptr1 -} { .mmi -(p_y) add ptr2 = 3, ptr1 -(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left] -(p_y) add cnt = -1, cnt -;; } -{ .mmi -(p_yy) cmp.le.unc p_y, p0 = 8, cnt - add ptr3 = ptr1, cnt // prepare last store - mov.i ar.lc = save_lc -} { .mmi -(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (o less) left] -(p_yy) add cnt = -4, cnt -;; } -{ .mmi -(p_y) cmp.le.unc p_yy, p0 = 8, cnt - add ptr3 = -1, ptr3 // last store - tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? -} { .mmi -(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left] -(p_y) add cnt = -4, cnt -;; } -{ .mmi -(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left] - tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? -} { .mmi -(p_yy) add cnt = -4, cnt -;; } -{ .mmb -(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes -(p_y) st1 [ptr3] = value // fill last byte (using ptr3) - br.ret.sptk.many rp -} -END(memset) -EXPORT_SYMBOL(memset) diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S deleted file mode 100644 index 1f4a46c15127..000000000000 --- a/arch/ia64/lib/strlen.S +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * Optimized version of the standard strlen() function - * - * - * Inputs: - * in0 address of string - * - * Outputs: - * ret0 the number of characters in the string (0 if empty string) - * does not count the \0 - * - * Copyright (C) 1999, 2001 Hewlett-Packard Co - * Stephane Eranian <eranian@hpl.hp.com> - * - * 09/24/99 S.Eranian add speculation recovery code - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -// -// -// This is an enhanced version of the basic strlen. it includes a combination -// of compute zero index (czx), parallel comparisons, speculative loads and -// loop unroll using rotating registers. -// -// General Ideas about the algorithm: -// The goal is to look at the string in chunks of 8 bytes. -// so we need to do a few extra checks at the beginning because the -// string may not be 8-byte aligned. In this case we load the 8byte -// quantity which includes the start of the string and mask the unused -// bytes with 0xff to avoid confusing czx. -// We use speculative loads and software pipelining to hide memory -// latency and do read ahead safely. This way we defer any exception. -// -// Because we don't want the kernel to be relying on particular -// settings of the DCR register, we provide recovery code in case -// speculation fails. The recovery code is going to "redo" the work using -// only normal loads. If we still get a fault then we generate a -// kernel panic. Otherwise we return the strlen as usual. -// -// The fact that speculation may fail can be caused, for instance, by -// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., -// a NaT bit will be set if the translation is not present. The normal -// load, on the other hand, will cause the translation to be inserted -// if the mapping exists. -// -// It should be noted that we execute recovery code only when we need -// to use the data that has been speculatively loaded: we don't execute -// recovery code on pure read ahead data. -// -// Remarks: -// - the cmp r0,r0 is used as a fast way to initialize a predicate -// register to 1. This is required to make sure that we get the parallel -// compare correct. -// -// - we don't use the epilogue counter to exit the loop but we need to set -// it to zero beforehand. -// -// - after the loop we must test for Nat values because neither the -// czx nor cmp instruction raise a NaT consumption fault. We must be -// careful not to look too far for a Nat for which we don't care. -// For instance we don't need to look at a NaT in val2 if the zero byte -// was in val1. -// -// - Clearly performance tuning is required. -// -// -// -#define saved_pfs r11 -#define tmp r10 -#define base r16 -#define orig r17 -#define saved_pr r18 -#define src r19 -#define mask r20 -#define val r21 -#define val1 r22 -#define val2 r23 - -GLOBAL_ENTRY(strlen) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 - - .rotr v[2], w[2] // declares our 4 aliases - - extr.u tmp=in0,0,3 // tmp=least significant 3 bits - mov orig=in0 // keep trackof initial byte address - dep src=0,in0,0,3 // src=8byte-aligned in0 address - .save pr, saved_pr - mov saved_pr=pr // preserve predicates (rotation) - ;; - - .body - - ld8 v[1]=[src],8 // must not speculate: can fail here - shl tmp=tmp,3 // multiply by 8bits/byte - mov mask=-1 // our mask - ;; - ld8.s w[1]=[src],8 // speculatively load next - cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and - sub tmp=64,tmp // how many bits to shift our mask on the right - ;; - shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part - mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) - ;; - add base=-16,src // keep track of aligned base - or v[1]=v[1],mask // now we have a safe initial byte pattern - ;; -1: - ld8.s v[0]=[src],8 // speculatively load next - czx1.r val1=v[1] // search 0 byte from right - czx1.r val2=w[1] // search 0 byte from right following 8bytes - ;; - ld8.s w[0]=[src],8 // speculatively load next to next - cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 - cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 -(p6) br.wtop.dptk 1b // loop until p6 == 0 - ;; - // - // We must return try the recovery code iff - // val1_is_nat || (val1==8 && val2_is_nat) - // - // XXX Fixme - // - there must be a better way of doing the test - // - cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) - tnat.nz p6,p7=val1 // test NaT on val1 -(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT - ;; - // - // if we come here p7 is true, i.e., initialized for // cmp - // - cmp.eq.and p7,p0=8,val1// val1==8? - tnat.nz.and p7,p0=val2 // test NaT if val2 -(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT - ;; -(p8) mov val1=val2 // the other test got us out of the loop -(p8) adds src=-16,src // correct position when 3 ahead -(p9) adds src=-24,src // correct position when 4 ahead - ;; - sub ret0=src,orig // distance from base - sub tmp=8,val1 // which byte in word - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // adjust - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of normal execution - - // - // Outlined recovery code when speculation failed - // - // This time we don't use speculation and rely on the normal exception - // mechanism. that's why the loop is not as good as the previous one - // because read ahead is not possible - // - // IMPORTANT: - // Please note that in the case of strlen() as opposed to strlen_user() - // we don't use the exception mechanism, as this function is not - // supposed to fail. If that happens it means we have a bug and the - // code will cause of kernel fault. - // - // XXX Fixme - // - today we restart from the beginning of the string instead - // of trying to continue where we left off. - // -.recover: - ld8 val=[base],8 // will fail if unrecoverable fault - ;; - or val=val,mask // remask first bytes - cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop - ;; - // - // ar.ec is still zero here - // -2: -(p6) ld8 val=[base],8 // will fail if unrecoverable fault - ;; - czx1.r val1=val // search 0 byte from right - ;; - cmp.eq p6,p0=8,val1 // val1==8 ? -(p6) br.wtop.dptk 2b // loop until p6 == 0 - ;; // (avoid WAW on p63) - sub ret0=base,orig // distance from base - sub tmp=8,val1 - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // length=now - back -1 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of successful recovery code -END(strlen) -EXPORT_SYMBOL(strlen) diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S deleted file mode 100644 index a287169bd953..000000000000 --- a/arch/ia64/lib/strncpy_from_user.S +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Just like strncpy() except that if a fault occurs during copying, - * -EFAULT is returned. - * - * Inputs: - * in0: address of destination buffer - * in1: address of string to be copied - * in2: length of buffer in bytes - * Outputs: - * r8: -EFAULT in case of fault or number of bytes copied if no fault - * - * Copyright (C) 1998-2001 Hewlett-Packard Co - * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com> - * - * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by - * by Andreas Schwab <schwab@suse.de>). - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(__strncpy_from_user) - alloc r2=ar.pfs,3,0,0,0 - mov r8=0 - mov r9=in1 - ;; - add r10=in1,in2 - cmp.eq p6,p0=r0,in2 -(p6) br.ret.spnt.many rp - - // XXX braindead copy loop---this needs to be optimized -.Loop1: - EX(.Lexit, ld1 r8=[in1],1) - ;; - EX(.Lexit, st1 [in0]=r8,1) - cmp.ne p6,p7=r8,r0 - ;; -(p6) cmp.ne.unc p8,p0=in1,r10 -(p8) br.cond.dpnt.few .Loop1 - ;; -(p6) mov r8=in2 // buffer filled up---return buffer length -(p7) sub r8=in1,r9,1 // return string length (excluding NUL character) -[.Lexit:] - br.ret.sptk.many rp -END(__strncpy_from_user) -EXPORT_SYMBOL(__strncpy_from_user) diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S deleted file mode 100644 index a7eb56e840a9..000000000000 --- a/arch/ia64/lib/strnlen_user.S +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Returns 0 if exception before NUL or reaching the supplied limit (N), - * a value greater than N if the string is longer than the limit, else - * strlen. - * - * Inputs: - * in0: address of buffer - * in1: string length limit N - * Outputs: - * r8: 0 in case of fault, strlen(buffer)+1 otherwise - * - * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com> - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(__strnlen_user) - .prologue - alloc r2=ar.pfs,2,0,0,0 - .save ar.lc, r16 - mov r16=ar.lc // preserve ar.lc - - .body - - add r3=-1,in1 - ;; - mov ar.lc=r3 - mov r9=0 - ;; - // XXX braindead strlen loop---this needs to be optimized -.Loop1: - EXCLR(.Lexit, ld1 r8=[in0],1) - add r9=1,r9 - ;; - cmp.eq p6,p0=r8,r0 -(p6) br.cond.dpnt .Lexit - br.cloop.dptk.few .Loop1 - - add r9=1,in1 // NUL not found---return N+1 - ;; -.Lexit: - mov r8=r9 - mov ar.lc=r16 // restore ar.lc - br.ret.sptk.many rp -END(__strnlen_user) -EXPORT_SYMBOL(__strnlen_user) diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S deleted file mode 100644 index 6e2a69662c06..000000000000 --- a/arch/ia64/lib/xor.S +++ /dev/null @@ -1,181 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * arch/ia64/lib/xor.S - * - * Optimized RAID-5 checksumming functions for IA-64. - */ - -#include <linux/export.h> -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(xor_ia64_2) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 3, 0, 13, 16 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov ar.lc = in0 - mov pr.rot = 1 << 16 - ;; - .rotr s1[6+1], s2[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[6+1])st8.nta [r8] = d[1], 8 - nop.f 0 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_2) -EXPORT_SYMBOL(xor_ia64_2) - -GLOBAL_ENTRY(xor_ia64_3) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 4, 0, 20, 24 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] - ;; -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], s3[6] - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_3) -EXPORT_SYMBOL(xor_ia64_3) - -GLOBAL_ENTRY(xor_ia64_4) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 5, 0, 27, 32 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - mov r19 = in4 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[0]) ld8.nta s4[0] = [r19], 8 -(p[6]) xor r20 = s3[6], s4[6] - ;; -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], r20 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_4) -EXPORT_SYMBOL(xor_ia64_4) - -GLOBAL_ENTRY(xor_ia64_5) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 6, 0, 34, 40 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - mov r19 = in4 - mov r20 = in5 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[0]) ld8.nta s4[0] = [r19], 8 -(p[6]) xor r21 = s3[6], s4[6] - ;; -(p[0]) ld8.nta s5[0] = [r20], 8 -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], r21 - ;; -(p[6]) xor d[0] = d[0], s5[6] - nop.f 0 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_5) -EXPORT_SYMBOL(xor_ia64_5) |