summaryrefslogtreecommitdiffstats
path: root/arch/ia64/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r-- arch/ia64/lib/Makefile 48
-rw-r--r-- arch/ia64/lib/checksum.c 102
-rw-r--r-- arch/ia64/lib/clear_page.S 79
-rw-r--r-- arch/ia64/lib/clear_user.S 212
-rw-r--r-- arch/ia64/lib/copy_page.S 101
-rw-r--r-- arch/ia64/lib/copy_page_mck.S 188
-rw-r--r-- arch/ia64/lib/copy_user.S 613
-rw-r--r-- arch/ia64/lib/csum_partial_copy.c 98
-rw-r--r-- arch/ia64/lib/do_csum.S 324
-rw-r--r-- arch/ia64/lib/flush.S 119
-rw-r--r-- arch/ia64/lib/idiv32.S 86
-rw-r--r-- arch/ia64/lib/idiv64.S 83
-rw-r--r-- arch/ia64/lib/io.c 51
-rw-r--r-- arch/ia64/lib/ip_fast_csum.S 148
-rw-r--r-- arch/ia64/lib/memcpy.S 304
-rw-r--r-- arch/ia64/lib/memcpy_mck.S 659
-rw-r--r-- arch/ia64/lib/memset.S 365
-rw-r--r-- arch/ia64/lib/strlen.S 195
-rw-r--r-- arch/ia64/lib/strncpy_from_user.S 47
-rw-r--r-- arch/ia64/lib/strnlen_user.S 48
-rw-r--r-- arch/ia64/lib/xor.S 181
21 files changed, 0 insertions, 4051 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines..
-#
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
- __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
- checksum.o clear_page.o csum_partial_copy.o \
- clear_user.o strncpy_from_user.o strnlen_user.o \
- flush.o ip_fast_csum.o do_csum.o \
- memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o =
-AFLAGS___udivdi3.o = -DUNSIGNED
-AFLAGS___moddi3.o = -DMODULO
-AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o =
-AFLAGS___udivsi3.o = -DUNSIGNED
-AFLAGS___modsi3.o = -DMODULO
-AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- return (__force __sum16)~from64to16(
- (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- unsigned long result;
-
- result = (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8);
-
- /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
- /* 64 to 33 */
- result = (result & 0xffffffff) + (result >> 32);
- /* 33 to 32 */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- u64 result = do_csum(buff, len);
-
- /* add in old sum, and carry.. */
- result += (__force u32)sum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
- return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index ba0dd2538fa5..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm Tuned for Itanium.
- * 2/12/02 kchen Tuned for both Itanium and McKinley
- * 3/08/02 davidm Some more tweaking
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE 64 // Itanium L3 line size
-# define PREFETCH_LINES 9 // magic number
-#else
-# define L3_LINE_SIZE 128 // McKinley L3 line size
-# define PREFETCH_LINES 12 // magic number
-#endif
-
-#define saved_lc r2
-#define dst_fetch r3
-#define dst1 r8
-#define dst2 r9
-#define dst3 r10
-#define dst4 r11
-
-#define dst_last r31
-
-GLOBAL_ENTRY(clear_page)
- .prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
-
- .body
- mov ar.lc = (PREFETCH_LINES - 1)
- mov dst_fetch = in0
- adds dst1 = 16, in0
- adds dst2 = 32, in0
- ;;
-.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- adds dst3 = 48, in0 // executing this multiple times is harmless
- br.cloop.sptk.few .fetch
- ;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
- mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
- ;;
-#ifdef CONFIG_ITANIUM
- // Optimized for Itanium
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
-#else
- // Optimized for McKinley
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- stf.spill.nta [dst3] = f0, 64
- stf.spill.nta [dst4] = f0, 128
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
- stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
-#endif
- stf.spill.nta [dst3] = f0, 64
-(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- br.cloop.sptk.few 1b
- ;;
- mov ar.lc = saved_lc // restore lc
- br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index 1d9e45ccf8e5..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- * in0: address of buffer
- * in1: length of buffer in bytes
- * Outputs:
- * r8: number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf r32
-#define len r33
-
-//
-// local registers
-//
-#define cnt r16
-#define buf2 r17
-#define saved_lc r18
-#define saved_pfs r19
-#define tmp r20
-#define len2 r21
-#define len3 r22
-
-//
-// Theory of operations:
-// - we check whether or not the buffer is small, i.e., less than 17
-// in which case we do the byte by byte loop.
-//
-// - Otherwise we go progressively from 1 byte store to 8byte store in
-// the head part, the body is a 16byte store loop and we finish with the
-// tail for the last 15 bytes.
-// The good point about this breakdown is that the long buffer handling
-// contains only 2 branches.
-//
-// The reason for not using shifting & masking for both the head and the
-// tail is to stay semantically correct. This routine is not supposed
-// to write bytes outside of the buffer. While most of the time this would
-// be ok, we can't tolerate a mistake. A classical example is the case
-// of multithreaded code where the extra bytes touched are actually owned
-// by another thread which runs concurrently to ours. Another, less likely,
-// example is with device drivers where reading an I/O mapped location may
-// have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,0,0,0
- cmp.eq p6,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
- .body
- ;; // avoid WAW on CFM
- adds tmp=-1,len // br.ctop is repeat/until
- mov ret0=len // return value is length at this point
-(p6) br.ret.spnt.many rp
- ;;
- cmp.lt p6,p0=16,len // if len > 16 then long memset
- mov ar.lc=tmp // initialize lc for small count
-(p6) br.cond.dptk .long_do_clear
- ;; // WAR on ar.lc
- //
- // worst case 16 iterations, avg 8 iterations
- //
- // We could have played with the predicates to use the extra
- // M slot for 2 stores/iteration but the cost of initializing
- // the various counters compared to how long the loop is supposed
- // to last on average does not make this solution viable.
- //
-1:
- EX( .Lexit1, st1 [buf]=r0,1 )
- adds len=-1,len // countdown length using len
- br.cloop.dptk 1b
- ;; // avoid RAW on ar.lc
- //
- // .Lexit1: comes from byte by byte loop
- // len contains bytes left
-.Lexit1:
- mov ret0=len // faster than using ar.lc
- mov ar.lc=saved_lc
- br.ret.sptk.many rp // end of short clear_user
-
-
- //
- // At this point we know we have more than 16 bytes to clear
- // so we focus on alignment (no branches required)
- //
- // The use of len/len2 for countdown of the number of bytes left
- // instead of ret0 is due to the fact that the exception code
- // changes the values of r8.
- //
-.long_do_clear:
- tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
- ;;
- EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
-(p6) adds len=-1,len;; // sync because buf is modified
- tbit.nz p6,p0=buf,1
- ;;
- EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
-(p6) adds len=-2,len;;
- tbit.nz p6,p0=buf,2
- ;;
- EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
-(p6) adds len=-4,len;;
- tbit.nz p6,p0=buf,3
- ;;
- EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
-(p6) adds len=-8,len;;
- shr.u cnt=len,4 // number of 128-bit (2x64bit) words
- ;;
- cmp.eq p6,p0=r0,cnt
- adds tmp=-1,cnt
-(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds buf2=8,buf // setup second base pointer
- mov ar.lc=tmp
- ;;
-
- //
- // 16bytes/iteration core loop
- //
- // The second store can never generate a fault because
- // we come into the loop only when we are 16-byte aligned.
- // This means that if we cross a page then it will always be
- // in the first store and never in the second.
- //
- //
- // We need to keep track of the remaining length. A possible (optimistic)
- // way would be to use ar.lc and derive how many bytes were left by
- // doing: left = 16*ar.lc + 16. This would avoid the addition at
- // every iteration.
- // However we need to keep the synchronization point. A template
- // M;;MB does not exist and thus we can keep the addition at no
- // extra cycle cost (use a nop slot anyway). It also simplifies the
- // (unlikely) error recovery code
- //
-
-2: EX(.Lexit3, st8 [buf]=r0,16 )
- ;; // needed to get len correct when error
- st8 [buf2]=r0,16
- adds len=-16,len
- br.cloop.dptk 2b
- ;;
- mov ar.lc=saved_lc
- //
- // tail correction based on len only
- //
- // We alternate the use of len3,len2 to allow parallelism and correct
- // error handling. We also reuse p6/p7 to return correct value.
- // The addition of len2/len3 does not cost anything more compared to
- // the regular memset as we had empty slots.
- //
-.dotail:
- mov len2=len // for parallelization of error handling
- mov len3=len
- tbit.nz p6,p0=len,3
- ;;
- EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
-(p6) adds len3=-8,len2
- tbit.nz p7,p6=len,2
- ;;
- EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
-(p7) adds len2=-4,len3
- tbit.nz p6,p7=len,1
- ;;
- EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
-(p6) adds len3=-2,len2
- tbit.nz p7,p6=len,0
- ;;
- EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
- mov ret0=r0 // success
- br.ret.sptk.many rp // end of most likely path
-
- //
- // Outlined error handling code
- //
-
- //
- // .Lexit3: comes from core loop, need restore pr/lc
- // len contains bytes left
- //
- //
- // .Lexit2:
- // if p6 -> coming from st8 or st2 : len2 contains what's left
- // if p7 -> coming from st4 or st1 : len3 contains what's left
- // We must restore lc/pr even though might not have been used.
-.Lexit2:
- .pred.rel "mutex", p6, p7
-(p6) mov len=len2
-(p7) mov len=len3
- ;;
- //
- // .Lexit3: also comes from head; ar.lc is restored unconditionally below
- // len contains bytes left
- //
-.Lexit3:
- mov ret0=len
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index c0a0e6b2af00..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH 3
-#define EPI p[PIPE_DEPTH-1]
-
-#define lcount r16
-#define saved_pr r17
-#define saved_lc r18
-#define saved_pfs r19
-#define src1 r20
-#define src2 r21
-#define tgt1 r22
-#define tgt2 r23
-#define srcf r24
-#define tgtf r25
-#define tgt_last r26
-
-#define Nrot ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
- .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
- t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- mov ar.ec=PIPE_DEPTH
-
- mov lcount=PAGE_SIZE/64-1
- .save pr, saved_pr
- mov saved_pr=pr
- mov pr.rot=1<<16
-
- .body
-
- mov src1=in1
- adds src2=8,in1
- mov tgt_last = PAGE_SIZE
- ;;
- adds tgt2=8,in0
- add srcf=512,in1
- mov ar.lc=lcount
- mov tgt1=in0
- add tgtf=512,in0
- add tgt_last = tgt_last, in0
- ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
- cmp.ltu p6,p0 = tgtf, tgt_last
- ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6) lfetch [srcf], 64
-(p6) lfetch [tgtf], 64
- br.ctop.sptk.few 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000 // restore predicates
- mov ar.pfs=saved_pfs
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index 5e8bb4b4b535..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- * David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * General idea:
- * - use regular loads and stores to prefetch data to avoid consuming M-slot just for
- * lfetches => good for in-cache performance
- * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- * cycle
- *
- * Principle of operation:
- * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- * of 128 bytes. When both of these lines are in the L2 and the first half of the
- * source line is in L1, we start copying the remaining words. The second half of the
- * source line is prefetched in an earlier iteration, so that by the time we start
- * accessing it, it's also present in the L1.
- *
- * We use a software-pipelined loop to control the overall operation. The pipeline
- * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
- * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
- * cache-lines, the last K stages are used to copy the cache-line words not copied by
- * the prefetches. The four relevant points in the pipeline are called A, B, C, D:
- * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- * into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- * the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- * as the separate .prefetch_loop. Logically, this loop performs exactly like the
- * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- * so that each loop iteration is faster (again, good for cached case).
- *
- * When reading the code, it helps to keep the following picture in mind:
- *
- * word 0 word 1
- * +------+------+---
- * | v[x] | t1 | ^
- * | t2 | t3 | |
- * | t4 | t5 | |
- * | t6 | t7 | | 128 bytes
- * | n[y] | t9 | | (L2 cache line)
- * | t10 | t11 | |
- * | t12 | t13 | |
- * | t14 | t15 | v
- * +------+------+---
- *
- * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
- * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- * an order that avoids bank conflicts.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0 r2
-#define src1 r3
-#define dst0 r9
-#define dst1 r10
-#define src_pre_mem r11
-#define dst_pre_mem r14
-#define src_pre_l2 r15
-#define dst_pre_l2 r16
-#define t1 r17
-#define t2 r18
-#define t3 r19
-#define t4 r20
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r21
-#define t15 r22
-
-#define saved_lc r23
-#define saved_pr r24
-
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 3)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
- .rotr v[2*PREFETCH_DIST], n[D-C+1]
- .rotp p[N]
-
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
- .save pr, saved_pr
- mov saved_pr = pr
- .body
-
- mov src_pre_mem = in1
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = in0
- mov ar.lc = 2*PREFETCH_DIST - 1
-
- add src_pre_l2 = 8*8, in1
- add dst_pre_l2 = 8*8, in0
- add src0 = 8, in1 // first t1 src
- add src1 = 3*8, in1 // first t3 src
- add dst0 = 8, in0 // first t1 dst
- add dst1 = 3*8, in0 // first t3 dst
- mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
- nop.m 0
- nop.i 0
- ;;
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
- mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-(p[D]) ld8 t2 = [src0], 3*8 // M0
-(p[D]) ld8 t4 = [src1], 3*8 // M1
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
-(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
- ;;
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
-(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
-(p[D]) st8 [dst0] = t1, 8 // M2
-(p[D]) st8 [dst1] = t3, 8 // M3
- ;;
-(p[D]) ld8 t5 = [src0], 8
-(p[D]) ld8 t7 = [src1], 3*8
-(p[D]) st8 [dst0] = t2, 3*8
-(p[D]) st8 [dst1] = t4, 3*8
- ;;
-(p[D]) ld8 t6 = [src0], 3*8
-(p[D]) ld8 t10 = [src1], 8
-(p[D]) st8 [dst0] = t5, 8
-(p[D]) st8 [dst1] = t7, 3*8
- ;;
-(p[D]) ld8 t9 = [src0], 3*8
-(p[D]) ld8 t11 = [src1], 3*8
-(p[D]) st8 [dst0] = t6, 3*8
-(p[D]) st8 [dst1] = t10, 8
- ;;
-(p[D]) ld8 t12 = [src0], 8
-(p[D]) ld8 t14 = [src1], 8
-(p[D]) st8 [dst0] = t9, 3*8
-(p[D]) st8 [dst1] = t11, 3*8
- ;;
-(p[D]) ld8 t13 = [src0], 4*8
-(p[D]) ld8 t15 = [src1], 4*8
-(p[D]) st8 [dst0] = t12, 8
-(p[D]) st8 [dst1] = t14, 8
- ;;
-(p[D-1])ld8 t1 = [src0], 8
-(p[D-1])ld8 t3 = [src1], 8
-(p[D]) st8 [dst0] = t13, 4*8
-(p[D]) st8 [dst1] = t15, 4*8
- br.ctop.sptk .line_copy
- ;;
- mov ar.lc = saved_lc
- mov pr = saved_pr, -1
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index 8daab72cfe77..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- * in0 address of source buffer
- * in1 address of destination buffer
- * in2 number of bytes to copy
- *
- * Outputs:
- * ret0 0 in case of success. The number of bytes NOT copied in
- * case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- * - handle the case where we have more than 16 bytes and the alignment
- * are different.
- * - more benchmarking
- * - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK 16 // we do byte copy below (must be >=16)
-#define PIPE_DEPTH 21 // pipe depth
-
-#define EPI p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst in0
-#define src in1
-#define len in2
-
-//
-// local registers
-//
-#define t1 r2 // rshift in bytes
-#define t2 r3 // lshift in bytes
-#define rshift r14 // right shift in bits
-#define lshift r15 // left shift in bits
-#define word1 r16
-#define word2 r17
-#define cnt r18
-#define len2 r19
-#define saved_lc r20
-#define saved_pr r21
-#define tmp r22
-#define val r23
-#define src1 r24
-#define dst1 r25
-#define src2 r26
-#define dst2 r27
-#define len1 r28
-#define enddst r29
-#define endsrc r30
-#define saved_pfs r31
-
-GLOBAL_ENTRY(__copy_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
- .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- adds len2=-1,len // br.ctop is repeat/until
- mov ret0=r0
-
- ;; // RAW of cfm when len=0
- cmp.eq p8,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
-(p8) br.ret.spnt.many rp // empty memcpy()
- ;;
- add enddst=dst,len // first byte after end of destination
- add endsrc=src,len // first byte after end of source
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates
-
- .body
-
- mov dst1=dst // copy because of rotation
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
-
- mov src1=src // copy because of rotation
- mov ar.lc=len2 // initialize lc for small count
- cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
-
- xor tmp=src,dst // same alignment test prepare
-(p10) br.cond.dptk .long_copy_user
- ;; // RAW pr.rot/p16 ?
- //
- // Now we do the byte by byte loop with software pipeline
- //
- // p7 is necessarily false by now
-1:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 1b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs // restore ar.ec
- br.ret.sptk.many rp // end of short memcpy
-
- //
- // Not 8-byte aligned
- //
-.diff_align_copy_user:
- // At this point we know we have more than 16 bytes to copy
- // and also that src and dest do _not_ have the same alignment.
- and src2=0x7,src1 // src offset
- and dst2=0x7,dst1 // dst offset
- ;;
- // The basic idea is that we copy byte-by-byte at the head so
- // that we can reach 8-byte alignment for both src1 and dst1.
- // Then copy the body using software pipelined 8-byte copy,
- // shifting the two back-to-back words right and left, then copy
- // the tail by copying byte-by-byte.
- //
- // Fault handling. If the byte-by-byte at the head fails on the
- // load, then restart and finish the pipeline by copying zeros
- // to the dst1. Then copy zeros for the rest of dst1.
- // If 8-byte software pipeline fails on the load, do the same as
- // failure_in3 does. If the byte-by-byte at the tail fails, it is
- // handled simply by failure_in_pipe1.
- //
- // The case p14 represents the source has more bytes in the
- // the first word (by the shifted part), whereas the p15 needs to
- // copy some bytes from the 2nd word of the source that has the
- // tail of the 1st of the destination.
- //
-
- //
- // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
- // to copy the head to dst1, to start 8-byte copy software pipeline.
- // We know src1 is not 8-byte aligned in this case.
- //
- cmp.eq p14,p15=r0,dst2
-(p15) br.cond.spnt 1f
- ;;
- sub t1=8,src2
- mov t2=src2
- ;;
- shl rshift=t2,3
- sub len1=len,t1 // set len1
- ;;
- sub lshift=64,rshift
- ;;
- br.cond.spnt .word_copy_user
- ;;
-1:
- cmp.leu p14,p15=src2,dst2
- sub t1=dst2,src2
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub word1=8,src2 // (8 - src offset)
-(p15) sub t1=r0,t1 // absolute value
-(p15) sub word1=8,dst2 // (8 - dst offset)
- ;;
- // For the case p14, we don't need to copy the shifted part to
- // the 1st word of destination.
- sub t2=8,t1
-(p14) sub word1=word1,t1
- ;;
- sub len1=len,word1 // resulting len
-(p15) shl rshift=t1,3 // in bits
-(p14) shl rshift=t2,3
- ;;
-(p14) sub len1=len1,t1
- adds cnt=-1,word1
- ;;
- sub lshift=64,rshift
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=cnt
- ;;
-2:
- EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 2b
- ;;
- clrrrb
- ;;
-.word_copy_user:
- cmp.gtu p9,p0=16,len1
-(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
- ;;
- shr.u cnt=len1,3 // number of 64-bit words
- ;;
- adds cnt=-1,cnt
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t2
-(p15) sub src1=src1,t1
- //
- // Now both src1 and dst1 point to an 8-byte aligned address. And
- // we have more than 8 bytes to copy.
- //
- mov ar.lc=cnt
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- ;;
-3:
- //
- // The pipeline consists of 3 stages:
- // 1 (p16): Load a word from src1
- // 2 (EPI_1): Shift right pair, saving to tmp
- // 3 (EPI): Store tmp to dst1
- //
- // To make it simple, use at least 2 (p16) loops to set up val1[n]
- // because we need 2 back-to-back val1[] to get tmp.
- // Note that this implies EPI_2 must be p18 or greater.
- //
-
-#define EPI_1 p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift) \
- (pred) br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift) \
-.copy_user_bit##rshift: \
-1: \
- EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
- EX(3f,(p16) ld8 val1[1]=[src1],8); \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 1b; \
- ;; \
- br.cond.sptk.many .diff_align_do_tail; \
-2: \
-(EPI) st8 [dst1]=tmp,8; \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-3: \
-(p16) mov val1[1]=r0; \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 2b; \
- ;; \
- br.cond.sptk.many .failure_in2
-
- //
- // Since the instruction 'shrp' requires a fixed 128-bit value
- // specifying the bits to shift, we need to provide 7 cases
- // below.
- //
- SWITCH(p6, 8)
- SWITCH(p7, 16)
- SWITCH(p8, 24)
- SWITCH(p9, 32)
- SWITCH(p10, 40)
- SWITCH(p11, 48)
- SWITCH(p12, 56)
- ;;
- CASE(p6, 8)
- CASE(p7, 16)
- CASE(p8, 24)
- CASE(p9, 32)
- CASE(p10, 40)
- CASE(p11, 48)
- CASE(p12, 56)
- ;;
- BODY(8)
- BODY(16)
- BODY(24)
- BODY(32)
- BODY(40)
- BODY(48)
- BODY(56)
- ;;
-.diff_align_do_tail:
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t1
-(p14) adds dst1=-8,dst1
-(p15) sub dst1=dst1,t1
- ;;
-4:
- // Tail correction.
- //
- // The problem with this pipelined loop is that the last word is not
- // loaded and thus part of the last word written is not correct.
- // To fix that, we simply copy the tail byte by byte.
-
- sub len1=endsrc,src1,1
- clrrrb
- ;;
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=len1
- ;;
-5:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 5b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Beginning of long memcpy (i.e. > 16 bytes)
- //
-.long_copy_user:
- tbit.nz p6,p7=src1,0 // odd alignment
- and tmp=7,tmp
- ;;
- cmp.eq p10,p8=r0,tmp
- mov len1=len // copy because of rotation
-(p8) br.cond.dpnt .diff_align_copy_user
- ;;
- // At this point we know we have more than 16 bytes to copy
- // and also that both src and dest have the same alignment
- // which may not be the one we want. So for now we must move
- // forward slowly until we reach 16byte alignment: no need to
- // worry about reaching the end of buffer.
- //
- EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
-(p6) adds len1=-1,len1;;
- tbit.nz p7,p0=src1,1
- ;;
- EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
-(p7) adds len1=-2,len1;;
- tbit.nz p8,p0=src1,2
- ;;
- //
- // Stop bit not required after ld4 because if we fail on ld4
- // we have never executed the ld1, therefore st1 is not executed.
- //
- EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
- ;;
- EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
- tbit.nz p9,p0=src1,3
- ;;
- //
- // Stop bit not required after ld8 because if we fail on ld8
- // we have never executed the ld2, therefore st2 is not executed.
- //
- EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
- EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8) adds len1=-4,len1
- ;;
- EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9) adds len1=-8,len1;;
- shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
- ;;
- EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
- tbit.nz p6,p0=len1,3
- cmp.eq p7,p0=r0,cnt
- adds tmp=-1,cnt // br.ctop is repeat/until
-(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds src2=8,src1
- adds dst2=8,dst1
- mov ar.lc=tmp
- ;;
- //
- // 16bytes/iteration
- //
-2:
- EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16) ld8 val2[0]=[src2],16
-
- EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;; // RAW on src1 when fall through from loop
- //
- // Tail correction based on len only
- //
- // No matter where we come from (loop or test) the src1 pointer
- // is 16 byte aligned AND we have less than 16 bytes to copy.
- //
-.dotail:
- EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
- tbit.nz p7,p0=len1,2
- ;;
- EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
- tbit.nz p8,p0=len1,1
- ;;
- EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
- tbit.nz p9,p0=len1,0
- ;;
- EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
- ;;
- EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
- mov ar.lc=saved_lc
- ;;
- EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
- mov ar.pfs=saved_pfs
- ;;
- EX(.failure_out, (p9) st1 [dst1]=val2[1])
- br.ret.sptk.many rp
-
-
- //
- // Here we handle the case where the byte by byte copy fails
- // on the load.
- // Several factors make the zeroing of the rest of the buffer kind of
- // tricky:
- // - the pipeline: loads/stores are not in sync (pipeline)
- //
- // In the same loop iteration, the dst1 pointer does not directly
- // reflect where the faulty load was.
- //
- // - pipeline effect
- // When you get a fault on load, you may have valid data from
- // previous loads that is still in transit and not yet stored. Such
- // data must be stored normally before moving on to zeroing the rest.
- //
- // - single/multi dispersal independence.
- //
- // solution:
- // - we don't disrupt the pipeline, i.e. data in transit in
- // the software pipeline will eventually be moved to memory.
- // We simply replace the load with a simple mov and keep the
- // pipeline going. We can't really do this inline because
- // p16 is always reset to 1 when lc > 0.
- //
-.failure_in_pipe1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-1:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // This is the case where the byte by byte copy fails on the load
- // when we copy the head. We need to finish the pipeline and copy
- // zeros for the rest of the destination. Since this happens
- // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-2:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 2b
- ;;
- sub len=enddst,dst1,1 // precompute len
- br.cond.dptk.many .failure_in1bis
- ;;
-
- //
- // Here we handle the head & tail part when we check for alignment.
- // The following code handles only the load failures. The
- // main difficulty comes from the fact that loads/stores are
- // scheduled. So when you fail on a load, the stores corresponding
- // to previous successful loads must be executed.
- //
- // However some simplifications are possible given the way
- // things work.
- //
- // 1) HEAD
- // Theory of operation:
- //
- // Page A | Page B
- // ---------|-----
- // 1|8 x
- // 1 2|8 x
- // 4|8 x
- // 1 4|8 x
- // 2 4|8 x
- // 1 2 4|8 x
- // |1
- // |2 x
- // |4 x
- //
- // page_size >= 4k (2^12). (x means 4, 2, 1)
- // Here we suppose Page A exists and Page B does not.
- //
- // As we move towards eight byte alignment we may encounter faults.
- // The numbers on each page show the size of the load (current alignment).
- //
- // Key point:
- // - if you fail on 1, 2, 4 then you have never executed any smaller
- // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
- // before.
- //
- // This allows us to simplify the cleanup code, because basically you
- // only have to worry about "pending" stores in the case of a failing
- // ld8(). Given the way the code is written today, this means only
- // worry about st2, st4. There we can use the information encapsulated
- // into the predicates.
- //
- // Other key point:
- // - if you fail on the ld8 in the head, it means you went straight
- // to it, i.e. 8byte alignment within a nonexistent page.
- // Again this comes from the fact that if you crossed just for the ld8 then
- // you are 8byte aligned but also 16byte aligned, therefore you would
- // either go for the 16byte copy loop OR the ld8 in the tail part.
- // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
- // because it would mean you had 15bytes to copy in which case you
- // would have defaulted to the byte by byte copy.
- //
- //
- // 2) TAIL
- // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
- // aligned.
- //
- // Key point:
- // This means that we either:
- // - are right on a page boundary
- // OR
- // - are at more than 16 bytes from a page boundary with
- // at most 15 bytes to copy: no chance of crossing.
- //
- // This allows us to assume that if we fail on a load we haven't possibly
- // executed any of the previous (tail) ones, so we don't need to do
- // any stores. For instance, if we fail on ld2, this means we had
- // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
- //
- // This means that we are in a situation similar to a fault in the
- // head part. That's nice!
- //
-.failure_in1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- sub len=endsrc,src1,1
- //
- // we know that ret0 can never be zero at this point
- // because we failed while trying to do a load, i.e. there is still
- // some work to do.
- // The failure_in1bis and length problem is taken care of at the
- // calling side.
- //
- ;;
-.failure_in1bis: // from (.failure_in3)
- mov ar.lc=len // Continue with a stupid byte store.
- ;;
-5:
- st1 [dst1]=r0,1
- br.cloop.dptk 5b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Here we simply restart the loop but instead
- // of doing loads we fill the pipeline with zeroes
- // We can't simply store r0 because we may have valid
- // data in transit in the pipeline.
- // ar.lc and ar.ec are setup correctly at this point
- //
- // we MUST use src1/endsrc here and not dst1/enddst because
- // of the pipeline effect.
- //
-.failure_in3:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- ;;
-2:
-(p16) mov val1[0]=r0
-(p16) mov val2[0]=r0
-(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;;
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
-.failure_in2:
- sub ret0=endsrc,src1
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // handling of failures on stores: that's the easy part
- //
-.failure_out:
- sub ret0=enddst,dst1
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
-
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned short
-from64to16(unsigned long x)
-{
-	unsigned long sum = (x >> 32) + (x & 0xffffffff);	/* 64 -> 33 bits */
-	int pass;
-
-	/*
-	 * Three 16-bit folds: 33 -> 17+c bits, then 16+c bits, then the last carry.
-	 */
-	for (pass = 0; pass < 3; pass++)
-		sum = (sum >> 16) + (sum & 0xffff);
-	return sum;
-}
-
-/*
- * do_csum_c - reference C implementation of the Internet checksum.
- * @buff: data to checksum
- * @len:  number of bytes at @buff; nothing is read when @len <= 0
- * @psum: partial sum to combine with the checksum of the data
- *
- * Returns @psum unchanged when @len <= 0, otherwise the 16-bit folded
- * ones'-complement sum of the data combined with @psum.
- *
- * The data is summed on its own, starting from zero, and @psum is folded
- * in only at the very end.  Seeding the accumulator with @psum loses it
- * when @buff is odd (the head byte is assigned, not added) and would also
- * byte-swap it together with the data sum in the final odd-address fix-up.
- */
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-	int odd, count;
-	unsigned long result = 0;
-
-	if (len <= 0)
-		return psum;
-	odd = 1 & (unsigned long) buff;
-	if (odd) {
-		/* Head byte at an odd address goes into the upper byte lane. */
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *) buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-			if (count) {
-				unsigned long carry = 0;
-				do {
-					unsigned long w = *(unsigned long *) buff;
-					count--;
-					buff += 8;
-					/* carry of the previous add is applied one step late */
-					result += carry;
-					result += w;
-					carry = (w > result);
-				} while (count);
-				result += carry;
-				result = (result & 0xffffffff) + (result >> 32);
-			}
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-
-	result = from64to16(result);
-
-	/* Data started on an odd address: undo the byte-lane rotation. */
-	if (odd)
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-	/* Only now combine with the caller's partial sum and fold once more. */
-	return from64to16(result + psum);
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
- * Data locality study on the checksum buffer.
- * More optimization cleanup - remove excessive stop bits.
- * 02/04/08 David Mosberger <davidm@hpl.hp.com>
- * More cleanup and tuning.
- * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
- * Clean up and optimize and the software pipeline, loading two
- * back-to-back 8-byte words per loop. Clean up the initialization
- * for the loop. Support the cases where load latency = 1 or 2.
- * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-// The goal is to go as quickly as possible to the point where
-// we can checksum 16 bytes/loop. Before reaching that point we must
-// take care of incorrect alignment of first byte.
-//
-// The code hereafter also takes care of the "tail" part of the buffer
-// before entering the core loop, if any. The checksum is a sum so it
-// allows us to commute operations. So we do the "head" and "tail"
-// first to finish at full speed in the body. Once we get the head and
-// tail values, we feed them into the pipeline, very handy initialization.
-//
-// Of course we deal with the special case where the whole buffer fits
-// into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-// possible load latency and also to accommodate for head and tail.
-//
-// The end of the function deals with folding the checksum from 64bits
-// down to 16bits taking care of the carry.
-//
-// This version avoids synchronization in the core loop by also using a
-// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-// wordx[] (x=1,2)
-// |---|
-// | | 0 : new value loaded in pipeline
-// |---|
-// | | - : in transit data
-// |---|
-// | | LOAD_LATENCY : current value to add to checksum
-// |---|
-// | | LOAD_LATENCY+1 : previous value added to checksum
-// |---| (previous iteration)
-//
-// resultx[] (x=1,2)
-// |---|
-// | | 0 : initial value
-// |---|
-// | | LOAD_LATENCY-1 : new checksum
-// |---|
-// | | LOAD_LATENCY : previous value of checksum
-// |---|
-// | | LOAD_LATENCY+1 : final checksum when out of the loop
-// |---|
-//
-//
-// See RFC1071 "Computing the Internet Checksum" for various techniques for
-// calculating the Internet checksum.
-//
-// NOT YET DONE:
-// - Maybe another algorithm which would take care of the folding at the
-// end in a different manner
-// - Work with people more knowledgeable than me on the network stack
-// to figure out if we could not split the function depending on the
-// type of packet or alignment we get. Like the ip_fast_csum() routine
-// where we know we have at least 20bytes worth of data to checksum.
-// - Do a better job of handling small packets.
-// - Note on prefetching: it was found that under various load, i.e. ftp read/write,
-// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-// on the data that buffer points to (partly because the checksum is often preceded by
-// a copy_from_user()). This finding indicates that lfetch will not be beneficial since
-// the data is already in the cache.
-//
-
-#define saved_pfs r11
-#define hmask r16
-#define tmask r17
-#define first1 r18
-#define firstval r19
-#define firstoff r20
-#define last r21
-#define lastval r22
-#define lastoff r23
-#define saved_lc r24
-#define saved_pr r25
-#define tmp1 r26
-#define tmp2 r27
-#define tmp3 r28
-#define carry1 r29
-#define carry2 r30
-#define first2 r31
-
-#define buf in0
-#define len in1
-
-#define LOAD_LATENCY 2 // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH (LOAD_LATENCY+2)
-#define ELD p[LOAD_LATENCY] // end of load
-#define ELD_1 p[LOAD_LATENCY+1] // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,16,0,16 // 2 inputs, 16 locals, 0 outputs, 16 rotating
- .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
- .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
- mov ret0=r0 // in case we have zero length
- cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
- ;;
- add tmp1=buf,len // address one past the last byte (buf+len)
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
-(p6) br.ret.spnt.many rp // return if zero or negative length
-
- mov hmask=-1 // initialize head mask
- tbit.nz p15,p0=buf,0 // is buf an odd address?
- and first1=-8,buf // 8-byte align down address of first1 element
-
- and firstoff=7,buf // how many bytes off for first1 element
- mov tmask=-1 // initialize tail mask
-
- ;;
- adds tmp2=-1,tmp1 // address of the last byte (buf+len-1)
- and lastoff=7,tmp1 // how many bytes off for last element
- ;;
- sub tmp1=8,lastoff // complement to lastoff
- and last=-8,tmp2 // address of word containing last byte
- ;;
- sub tmp3=last,first1 // tmp3=distance from first1 to last
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // save lc
- cmp.eq p8,p9=last,first1 // everything fits in one word ?
-
- ld8 firstval=[first1],8 // load, ahead of time, "first1" word
- and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
- shl tmp2=firstoff,3 // number of bits
- ;;
-(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
- shl tmp1=tmp1,3 // number of bits
-(p9) adds tmp3=-8,tmp3 // effectively loaded
- ;;
-(p8) mov lastval=r0 // we don't need lastval if first1==last
- shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[
- shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
- ;;
- .body
-#define count tmp3
-
-(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
-(p9) and word2[0]=lastval,tmask // mask the last word as appropriate
- shr.u count=count,3 // how many 8-byte?
- ;;
- // If count is odd, finish this 8-byte word so that we can
- // load two back-to-back 8-byte words per loop thereafter.
- and word1[0]=firstval,hmask // mask the first word as appropriate
- tbit.nz p10,p11=count,0 // if (count is odd)
- ;;
-(p8) mov result1[0]=word1[0] // single word: the masked head is the whole sum
-(p9) add result1[0]=word1[0],word2[0] // otherwise start with head + tail words
- ;;
- cmp.ltu p6,p0=result1[0],word1[0] // check the carry
- cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
- ;;
-(p6) adds result1[0]=1,result1[0] // end-around carry
-(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
-(p11) br.cond.dptk .do_csum16 // if (count is even)
-
- // Here count is odd.
- ld8 word1[1]=[first1],8 // load an 8-byte word
- cmp.eq p9,p10=1,count // if (count == 1)
- adds count=-1,count // loaded an 8-byte word
- ;;
- add result1[0]=result1[0],word1[1]
- ;;
- cmp.ltu p6,p0=result1[0],word1[1] // did that add wrap?
- ;;
-(p6) adds result1[0]=1,result1[0] // end-around carry
-(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
- // Fall through to calculate the checksum, feeding result1[0] as
- // the initial value in result1[0].
- //
- // Calculate the checksum loading two 8-byte words per loop.
- //
-.do_csum16:
- add first2=8,first1 // second load stream, 8 bytes past first1
- shr.u count=count,1 // we do 16 bytes per loop
- ;;
- adds count=-1,count // ar.lc is loaded with iterations-1
- mov carry1=r0 // clear the deferred carry counters
- mov carry2=r0
- brp.loop.imp 1f,2f
- ;;
- mov ar.ec=PIPE_DEPTH
- mov ar.lc=count // set lc
- mov pr.rot=1<<16 // p16 (first rotating predicate) true, the rest false
- // result1[0] must be initialized in advance.
- mov result2[0]=r0
- ;;
- .align 32
-1:
-(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] // did the previous add wrap?
-(pC1[1])adds carry1=1,carry1 // count the carry detected one iteration ago
-(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] // add the word whose load has completed
-(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
- br.ctop.sptk 1b
- ;;
- // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1 // since we miss the last one
-(pC2[1])adds carry2=1,carry2
- ;;
- add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 // fold the counted carries back in
- add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
- ;;
- cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
- cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
- ;;
-(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
- ;;
- add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] // merge the two accumulators
- ;;
- cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
- ;;
-(p6) adds result1[0]=1,result1[0]
- ;;
-.do_csum_exit:
- //
- // now fold 64 into 16 bits taking care of carry
- // that's not very good because it has lots of sequentiality
- //
- mov tmp3=0xffff // 16-bit mask (tmp3 is no longer needed as count)
- zxt4 tmp1=result1[0] // low 32 bits
- shr.u tmp2=result1[0],32 // high 32 bits
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add ret0=tmp1,tmp2
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- // if buf was odd then swap bytes
- mov ar.pfs=saved_pfs // restore ar.ec
-(p15) mux1 ret0=ret0,@rev // reverse word
- ;;
- mov ar.lc=saved_lc
-(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
- br.ret.sptk.many rp
-
-// I (Jun Nakajima) wrote an equivalent code (see below), but it was
-// not much better than the original. So keep the original there so that
-// someone else can challenge.
-//
-// shr.u word1[0]=result1[0],32
-// zxt4 result1[0]=result1[0]
-// ;;
-// add result1[0]=result1[0],word1[0]
-// ;;
-// zxt2 result2[0]=result1[0]
-// extr.u word1[0]=result1[0],16,16
-// shr.u carry1=result1[0],32
-// ;;
-// add result2[0]=result2[0],word1[0]
-// ;;
-// add result2[0]=result2[0],carry1
-// ;;
-// extr.u ret0=result2[0],16,16
-// ;;
-// add ret0=ret0,result2[0]
-// ;;
-// zxt2 ret0=ret0
-// mov ar.pfs=saved_pfs // restore ar.ec
-// mov pr=saved_pr,0xffffffffffff0000
-// ;;
-// // if buf was odd then swap bytes
-// mov ar.lc=saved_lc
-//(p15) mux1 ret0=ret0,@rev // reverse word
-// ;;
-//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
-// br.ret.sptk.many rp
-
-END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index f8e795fe45cb..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart Dynamic stride size
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
- /*
- * flush_icache_range(start,end)
- *
- * Make i-cache(s) coherent with d-caches.
- *
- * Must deal with range from start to end-1 but nothing else (need to
- * be careful not to touch addresses that may be unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0 // 2 inputs (in0=start, in1=end), no locals or outputs
- movl r3=ia64_i_cache_stride_shift // r3: address of the stride-shift variable
- mov r21=1 // becomes the stride size once shifted below
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=in1,r0,1 // last byte address (end - 1)
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the i-cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1; NOTE(review): presumably callers guarantee end > start - confirm
- shl r24=r23,r20 // r24: addresses for "fc.i" =
- // "start" rounded down to stride boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc (r3 is free again: the stride shift is already in r20)
- ;;
-
- .body
- mov ar.lc=r8 // br.cloop runs the loop body ar.lc+1 times
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop: fc.i r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
- /*
- * clflush_cache_range(start,size)
- *
- * Flush cache lines from start to start+size-1.
- *
- * Must deal with range from start to start+size-1 but nothing else
- * (need to be careful not to touch addresses that may be
- * unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0 // 2 inputs (in0=start, in1=size), no locals or outputs
- movl r3=ia64_cache_stride_shift // r3: address of the stride-shift variable
- mov r21=1 // becomes the stride size once shifted below
- add r22=in1,in0 // start + size
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=r22,r0,1 // last byte address (start + size - 1)
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1; NOTE(review): presumably callers guarantee size > 0 - confirm
- shl r24=r23,r20 // r24: addresses for "fc" =
- // "start" rounded down to stride
- // boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc (r3 is free again: the stride shift is already in r20)
- ;;
-
- .body
- mov ar.lc=r8 // br.cloop runs the loop body ar.lc+1 times
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop_fc:
- fc r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop_fc
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(clflush_cache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index 83586fbc51ff..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define EXTEND zxt4
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND sxt4
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME) // NAME expands to __[u]{div,mod}si3 depending on UNSIGNED/MODULO
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
- EXTEND in0 = in0 // in0 = a
- EXTEND in1 = in1 // in1 = b
- ;;
- setf.sig f8 = in0 // f8 = a (as an integer significand)
- setf.sig f9 = in1 // f9 = b (as an integer significand)
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- setf.exp f7 = r2 // f7 = 2^-34
- frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
- ;;
-#ifdef MODULO
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
-(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
- ;;
-#ifdef MODULO
- setf.sig f7 = in0 // f7 = a (2^-34 is no longer needed in f7)
-#endif
-(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
- ;;
- FP_TO_INT(f6, f6) // q = trunc(q2)
- ;;
-#ifdef MODULO
- xma.l f6 = f6, f9, f7 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f6 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index 5c9113691f72..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME) // NAME expands to __[u]{div,mod}di3 depending on UNSIGNED/MODULO
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- setf.sig f8 = in0 // f8 = a (as an integer significand)
- setf.sig f9 = in1 // f9 = b (as an integer significand)
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
- ;;
-(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
-(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
- ;;
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
-(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
-(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
- ;;
-(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
-(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
- ;;
-#ifdef MODULO
- setf.sig f8 = in0 // f8 = a
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
- ;;
- FP_TO_INT(f11, f11) // q = trunc(q3)
- ;;
-#ifdef MODULO
- xma.l f11 = f11, f9, f8 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f11 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * memcpy_fromio - copy @count bytes from I/O memory @from into ordinary
- * memory @to.  Unoptimized: the transfer is done one byte at a time.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-	char *out = to;
-	long left;
-
-	/* Every byte is fetched with its own readb(); no wider accesses. */
-	for (left = count; left != 0; left--)
-		*out++ = readb(from++);
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * memcpy_toio - copy @count bytes from ordinary memory @from into I/O
- * memory @to.  Unoptimized: the transfer is done one byte at a time.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-	const char *in = from;
-	long left;
-
-	/* Every byte is stored with its own writeb(); no wider accesses. */
-	for (left = count; left != 0; left--)
-		writeb(*in++, to++);
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * memset_io - fill @count bytes of I/O memory starting at @dst with the
- * low 8 bits of @c.
- * Unoptimized: the fill is done one byte at a time.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-	const unsigned char fill = (char)(c & 0xff);
-	long left;
-
-	/* Every byte is stored with its own writeb(); no wider accesses. */
-	for (left = count; left != 0; left--, dst++)
-		writeb(fill, dst);
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index fcc0b812ce2e..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer in 32-bit words (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
- * versus calling the generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes. However, due to the lack of constraints
- * put on the function input arguments, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0 r32
-#define in1 r33
-#define in2 r34
-#define in3 r35
-#define in4 r36
-#define ret0 r8
-
-GLOBAL_ENTRY(ip_fast_csum) // fast path: 5 aligned words summed inline; anything else goes to do_csum
- .prologue
- .body
- cmp.ne p6,p7=5,in1 // size other than 20 byte?
- and r14=3,in0 // is it aligned on 4-byte?
- add r15=4,in0 // second source pointer
- ;;
- cmp.ne.or.andcm p6,p7=r14,r0 // misaligned: force p6=1, p7=0 (p6 = take generic path, p7 = fast path)
- ;;
-(p7) ld4 r20=[in0],8 // word 0; predicated, so in0 stays intact for .generic
-(p7) ld4 r21=[r15],8 // word 1
-(p6) br.spnt .generic
- ;;
- ld4 r22=[in0],8 // word 2
- ld4 r23=[r15],8 // word 3
- ;;
- ld4 r24=[in0] // word 4
- add r20=r20,r21 // word0 + word1
- add r22=r22,r23 // word2 + word3
- ;;
- add r20=r20,r22
- ;;
- add r20=r20,r24 // r20 = sum of all five 32-bit words
- ;;
- shr.u ret0=r20,16 // now need to add the carry
- zxt2 r20=r20 // keep the low 16 bits
- ;;
- add r20=ret0,r20 // high part + low 16 bits
- ;;
- shr.u ret0=r20,16 // add carry again
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16 // third and last fold
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- mov r9=0xffff
- ;;
- andcm ret0=r9,r20 // ret0 = 0xffff & ~r20 (16-bit complement)
- .restore sp // reset frame state
- br.ret.sptk.many b0
- ;;
-
-.generic:
- .prologue
- .save ar.pfs, r35
- alloc r35=ar.pfs,2,2,2,0 // 2 inputs, 2 locals (r34, r35), 2 outputs
- .save rp, r34
- mov r34=b0 // keep the return address across the call
- .body
- dep.z out1=in1,2,30 // out1 = in1 << 2 (presumably word count -> byte count for do_csum)
- mov out0=in0 // buffer address, unchanged
- ;;
- br.call.sptk.many b0=do_csum
- ;;
- andcm ret0=-1,ret0 // ret0 = ~ret0 (complement of do_csum's result)
- mov ar.pfs=r35
- mov b0=r34
- br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic) // sums 4 words from [in0], 4 words from [in1], in2/in3 (byte-swapped) and in4
- ld4 r20=[in0],4 // word 0 of the in0 buffer
- ld4 r21=[in1],4 // word 0 of the in1 buffer
- zxt4 in2=in2 // keep only the low 32 bits of in2
- ;;
- ld4 r22=[in0],4 // word 1
- ld4 r23=[in1],4
- dep r15=in3,in2,32,16 // r15 = in2 with the low 16 bits of in3 placed at bits 32-47
- ;;
- ld4 r24=[in0],4 // word 2
- ld4 r25=[in1],4
- mux1 r15=r15,@rev // reverse the byte order of r15
- add r16=r20,r21
- add r17=r22,r23
- zxt4 in4=in4 // keep only the low 32 bits of in4
- ;;
- ld4 r26=[in0],4 // word 3
- ld4 r27=[in1],4
- shr.u r15=r15,16 // drop the two zero bytes that the reversal moved to the bottom
- add r18=r24,r25
- add r8=r16,r17
- ;;
- add r19=r26,r27
- add r8=r8,r18
- ;;
- add r8=r8,r19 // r8 = sum of all eight loaded words
- add r15=r15,in4
- ;;
- add r8=r8,r15 // add the swapped in2/in3 value and in4
- ;;
- shr.u r10=r8,32 // now fold sum into short
- zxt4 r11=r8
- ;;
- add r8=r10,r11 // high 32 bits + low 32 bits
- ;;
- shr.u r10=r8,16 // yeah, keep it rolling
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- ;;
- shr.u r10=r8,16 // three times lucky
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- mov r9=0xffff
- ;;
- andcm r8=r9,r8 // r8 = 0xffff & ~r8 (16-bit complement)
- br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * retval (r8): the destination address (in0)
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-# define MEM_LAT 21 /* latency to memory */
-
-# define dst r2
-# define src r3
-# define retval r8
-# define saved_pfs r9
-# define saved_lc r10
-# define saved_pr r11
-# define cnt r16
-# define src2 r17
-# define t0 r18
-# define t1 r19
-# define t2 r20
-# define t3 r21
-# define t4 r22
-# define src_end r23
-
-# define N (MEM_LAT + 4)
-# define Nrot ((N + 7) & ~7)
-
- /*
- * First, check if everything (src, dst, len) is a multiple of eight. If
- * so, we handle everything with no taken branches (other than the loop
- * itself) and a small icache footprint. Otherwise, we jump off to
- * the more general copy routine handling arbitrary
- * sizes/alignment etc.
- */
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- or t0=in0,in1 // t0 = dst | src
- ;;
-
- or t0=t0,in2 // t0 = dst | src | len
- .save pr, saved_pr
- mov saved_pr=pr
-
- .body
-
- cmp.eq p6,p0=in2,r0 // zero length?
- mov retval=in0 // return dst
-(p6) br.ret.spnt.many rp // zero length, return immediately
- ;;
-
- mov dst=in0 // copy because of rotation
- shr.u cnt=in2,3 // number of 8-byte words to copy
- mov pr.rot=1<<16 // p16 = 1, all other rotating predicates = 0
- ;;
-
- adds cnt=-1,cnt // br.ctop is repeat/until
- cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
- mov ar.ec=N
- ;;
-
- and t0=0x7,t0 // t0 = (dst | src | len) & 7
- mov ar.lc=cnt
- ;;
- cmp.ne p6,p0=t0,r0 // p6 = dst, src or len is not a multiple of 8
-
- mov src=in1 // copy because of rotation
-(p7) br.cond.spnt.few .memcpy_short
-(p6) br.cond.spnt.few .memcpy_long
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- .rotr val[N]
- .rotp p[N]
- .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
- nop.i 0
- brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
- nop.f 0
- br.ctop.dptk.few 1b
-}
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
- * copy loop. This performs relatively poorly on Itanium, but it doesn't
- * get used very often (gcc inlines small copies) and due to atomicity
- * issues, we want to avoid read-modify-write of entire words.
- */
- .align 32
-.memcpy_short:
- adds cnt=-1,in2 // br.ctop is repeat/until
- mov ar.ec=MEM_LAT
- brp.loop.imp 1f, 2f
- ;;
- mov ar.lc=cnt
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- nop.m 0
- ;;
- /*
- * It is faster to put a stop bit in the loop here because it makes
- * the pipeline shorter (and latency is what matters on short copies).
- */
- .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
- nop.i 0
- brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
- nop.f 0
- br.ctop.dptk.few 1b
-} ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
- * an overriding concern here, but throughput is. We first do
- * sub-word copying until the destination is aligned, then we check
- * if the source is also aligned. If so, we do a simple load/store-loop
- * until there are less than 8 bytes left over and then we do the tail,
- * by storing the last few bytes using sub-word copying. If the source
- * is not aligned, we branch off to the non-congruent loop.
- *
- * stage: op:
- * 0 ld
- * :
- * MEM_LAT+3 shrp
- * MEM_LAT+4 st
- *
- * On Itanium, the pipeline itself runs without stalls. However, br.ctop
- * seems to introduce an unavoidable bubble in the pipeline so the overall
- * latency is 2 cycles/iteration. This gives us a _copy_ throughput
- * of 4 byte/cycle. Still not bad.
- */
-# undef N
-# undef Nrot
-# define N (MEM_LAT + 5) /* number of stages */
-# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
-
-#define LOG_LOOP_SIZE 6
-
-.memcpy_long:
- alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
- and t0=-8,src // t0 = src & ~7
- and t2=7,src // t2 = src & 7
- ;;
- ld8 t0=[t0] // t0 = 1st source word
- adds src2=7,src // src2 = (src + 7)
- sub t4=r0,dst // t4 = -dst
- ;;
- and src2=-8,src2 // src2 = (src + 7) & ~7
- shl t2=t2,3 // t2 = 8*(src & 7)
- shl t4=t4,3 // t4 = 8*(-dst); bits 3-5 now hold (-dst & 7)
- ;;
- ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
- sub t3=64,t2 // t3 = 64-8*(src & 7)
- shr.u t0=t0,t2
- ;;
- add src_end=src,in2
- shl t1=t1,t3
- mov pr=t4,0x38 // (p5,p4,p3)=(-dst & 7), i.e. # of bytes needed to align dst
- ;;
- or t0=t0,t1
- mov cnt=r0
- adds src_end=-1,src_end
- ;;
-(p3) st1 [dst]=t0,1
-(p3) shr.u t0=t0,8
-(p3) adds cnt=1,cnt
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
-(p4) adds cnt=2,cnt
- ;;
-(p5) st4 [dst]=t0,4
-(p5) adds cnt=4,cnt
- and src_end=-8,src_end // src_end = last word of source buffer
- ;;
-
- // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{ add src=cnt,src // make src point to remainder of source buffer
- sub cnt=in2,cnt // cnt = number of bytes left to copy
- mov t4=ip
- } ;;
- and src2=-8,src // align source pointer
- adds t4=.memcpy_loops-1b,t4 // t4 = address of .memcpy_loops
- mov ar.ec=N
-
- and t0=7,src // t0 = src & 7
- shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
- shl cnt=cnt,3 // move bits 0-2 to 3-5
- ;;
-
- .rotr val[N+1], w[2]
- .rotp p[N]
-
- cmp.ne p6,p0=t0,r0 // is src aligned, too?
- shl t0=t0,LOG_LOOP_SIZE // t0 = (src & 7) << LOG_LOOP_SIZE: offset of the matching COPY() variant
- adds t2=-1,t2 // br.ctop is repeat/until
- ;;
- add t4=t0,t4 // t4 = address of the COPY() variant to jump to
- mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy
- mov ar.lc=t2
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
-(p6) ld8 val[1]=[src2],8 // prime the pump...
- mov b6=t4
- br.sptk.few b6
- ;;
-
-.memcpy_tail:
- // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
- // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5) st4 [dst]=t0,4
-(p5) shr.u t0=t0,32
- mov ar.lc=saved_lc
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
- mov ar.pfs=saved_pfs
- ;;
-(p3) st1 [dst]=t0
- mov pr=saved_pr,-1
- br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
- .align 64
-
-#define COPY(shift,index) \
- 1: { .mib \
- (p[0]) ld8 val[0]=[src2],8; \
- (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
- brp.loop.imp 1b, 2f \
- }; \
- 2: { .mfb \
- (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
- nop.f 0; \
- br.ctop.dptk.few 1b; \
- }; \
- ;; \
- ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
- ;; \
- shrp t0=val[N-1],val[N-index],shift; \
- br .memcpy_tail
-.memcpy_loops:
- COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
- COPY(8, 0)
- COPY(16, 0)
- COPY(24, 0)
- COPY(32, 0)
- COPY(40, 0)
- COPY(48, 0)
- COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index c0d4362217ae..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * for memcpy: return dest
- * for copy_user: return 0 if success,
- * or number of byte NOT copied if error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval r8
-#define saved_pfs r31
-#define saved_lc r10
-#define saved_pr r11
-#define saved_in0 r14
-#define saved_in1 r15
-#define saved_in2 r16
-
-#define src0 r2
-#define src1 r3
-#define dst0 r17
-#define dst1 r18
-#define cnt r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST 8
-#define src_pre_mem r19
-#define dst_pre_mem r20
-#define src_pre_l2 r21
-#define dst_pre_l2 r22
-#define t1 r23
-#define t2 r24
-#define t3 r25
-#define t4 r26
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define n8 r27
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r28
-#define t15 r29
-#define tmp r30
-
-/* defines for long_copy block */
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 1)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-/* alias */
-#define in0 r32
-#define in1 r33
-#define in2 r34
-
-GLOBAL_ENTRY(memcpy) // entry stub: sets memcpy-specific state, then joins the code shared with __copy_user
- and r28=0x7,in0 // r28 = dest & 7
- and r29=0x7,in1 // r29 = src & 7
- mov f6=f0 // f6 = 0.0 marks a memcpy call (the fault handler tests f6 against f0)
- mov retval=in0 // memcpy returns dest
- br.cond.sptk .common_code // continue at .common_code inside __copy_user
- ;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)
- .prologue
-// check dest alignment
- and r28=0x7,in0
- and r29=0x7,in1
- mov f6=f1 // f6 = 1.0 marks a __copy_user call (the fault handler tests f6 against f0)
- mov saved_in0=in0 // save dest pointer
- mov saved_in1=in1 // save src pointer
- mov retval=r0 // initialize return value
- ;;
-.common_code:
- cmp.gt p15,p0=8,in2 // check for small size
- cmp.ne p13,p0=0,r28 // check dest alignment
- cmp.ne p14,p0=0,r29 // check src alignment
- add src0=0,in1
- sub r30=8,r28 // for .align_dest
- mov saved_in2=in2 // save len
- ;;
- add dst0=0,in0
- add dst1=1,in0 // dest odd index
- cmp.le p6,p0 = 1,r30 // for .align_dest
-(p15) br.cond.dpnt .memcpy_short
-(p13) br.cond.dpnt .align_dest
-(p14) br.cond.dpnt .unaligned_src
- ;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
- .save pr, saved_pr
- mov saved_pr=pr
-
- shr.u cnt=in2,7 // this much cache line
- ;;
- cmp.lt p6,p0=2*PREFETCH_DIST,cnt
- cmp.lt p7,p8=1,cnt
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .body
- add cnt=-1,cnt
- add src_pre_mem=0,in1 // prefetch src pointer
- add dst_pre_mem=0,in0 // prefetch dest pointer
- ;;
-(p7) mov ar.lc=cnt // prefetch count
-(p8) mov ar.lc=r0
-(p6) br.cond.dpnt .long_copy
- ;;
-
-.prefetch:
- lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few .prefetch
- ;;
-
-.medium_copy:
- and tmp=31,in2 // copy length after iteration
- shr.u r29=in2,5 // number of 32-byte iteration
- add dst1=8,dst0 // 2nd dest pointer
- ;;
- add cnt=-1,r29 // ctop iteration adjustment
- cmp.eq p10,p0=r29,r0 // do we really need to loop?
- add src1=8,src0 // 2nd src pointer
- cmp.le p6,p0=8,tmp
- ;;
- cmp.le p7,p0=16,tmp
- mov ar.lc=cnt // loop setup
- cmp.eq p16,p17 = r0,r0
- mov ar.ec=2
-(p10) br.dpnt.few .aligned_src_tail
- ;;
- TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16) ld8 r34=[src0],16)
-EK(.ex_handler, (p16) ld8 r38=[src1],16)
-EX(.ex_handler, (p17) st8 [dst0]=r33,16)
-EK(.ex_handler, (p17) st8 [dst1]=r37,16)
- ;;
-EX(.ex_handler, (p16) ld8 r32=[src0],16)
-EK(.ex_handler, (p16) ld8 r36=[src1],16)
-EX(.ex_handler, (p16) st8 [dst0]=r34,16)
-EK(.ex_handler, (p16) st8 [dst1]=r38,16)
- br.ctop.dptk.few 1b
- ;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6) ld8 t1=[src0])
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
- cmp.le p8,p0=24,tmp
- and r21=-8,tmp
- ;;
-EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
-EX(.ex_handler, (p6) st8 [dst0]=t1) // store 1st 8-byte word
- and in2=7,tmp // remaining length
-EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store 2nd 8-byte word
- add src0=src0,r21 // setting up src pointer
- add dst0=dst0,r21 // setting up dest pointer
- ;;
-EX(.ex_handler, (p8) st8 [dst1]=t3) // store 3rd 8-byte word
- mov pr=saved_pr,-1
- br.dptk.many .memcpy_short
- ;;
-
-/* code taken from copy_page_mck */
-.long_copy:
- .rotr v[2*PREFETCH_DIST]
- .rotp p[N]
-
- mov src_pre_mem = src0
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = dst0
-
- add src_pre_l2 = 8*8, src0
- add dst_pre_l2 = 8*8, dst0
- ;;
- add src0 = 8, src_pre_mem // first t1 src
- mov ar.lc = 2*PREFETCH_DIST - 1
- shr.u cnt=in2,7 // number of lines
- add src1 = 3*8, src_pre_mem // first t3 src
- add dst0 = 8, dst_pre_mem // first t1 dst
- add dst1 = 3*8, dst_pre_mem // first t3 dst
- ;;
- and tmp=127,in2 // remaining bytes after this block
- add cnt = -(2*PREFETCH_DIST) - 1, cnt
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
-EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1
- mov ar.lc = cnt
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
-EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
-EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
-EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
- ;;
-EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
-EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
-EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
-EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
- ;;
-EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
-EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
- ;;
-EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
-EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
- br.ctop.sptk .line_copy
- ;;
-
- add dst0=-8,dst0
- add src0=-8,src0
- mov in2=tmp
- .restore sp
- br.sptk.many .medium_copy
- ;;
-
-#define BLOCK_SIZE 128*32
-#define blocksize r23
-#define curlen r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
-.unaligned_src:
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,5,0,8
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .save pr, saved_pr
- mov saved_pr=pr
- .body
-.4k_block:
- mov saved_in0=dst0 // need to save all input arguments
- mov saved_in2=in2
- mov blocksize=BLOCK_SIZE
- ;;
- cmp.lt p6,p7=blocksize,in2
- mov saved_in1=src0
- ;;
-(p6) mov in2=blocksize
- ;;
- shr.u r21=in2,7 // this much cache line
- shr.u r22=in2,4 // number of 16-byte iteration
- and curlen=15,in2 // copy length after iteration
- and r30=7,src0 // source alignment
- ;;
- cmp.lt p7,p8=1,r21
- add cnt=-1,r21
- ;;
-
- add src_pre_mem=0,src0 // prefetch src pointer
- add dst_pre_mem=0,dst0 // prefetch dest pointer
- and src0=-8,src0 // 1st src pointer
-(p7) mov ar.lc = cnt
-(p8) mov ar.lc = r0
- ;;
- TEXT_ALIGN(32)
-1: lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few 1b
- ;;
-
- shladd dst1=r22,3,dst0 // 2nd dest pointer
- shladd src1=r22,3,src0 // 2nd src pointer
- cmp.eq p8,p9=r22,r0 // do we really need to loop?
- cmp.le p6,p7=8,curlen; // have at least 8 byte remaining?
- add cnt=-1,r22 // ctop iteration adjustment
- ;;
-EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
-EK(.ex_handler, (p9) ld8 r37=[src1],8)
-(p8) br.dpnt.few .noloop
- ;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below needs to confine its size to a power of two, so an entry
-// can be calculated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size. This makes it somewhat tedious when the COPYU macro gets
-// changed and this needs to be adjusted to match.
-#define LOOP_SIZE 6
-1:
- mov r29=ip // jmp_table thread
- mov ar.lc=cnt
- ;;
- add r29=.jump_table - 1b - (.jmp1-.jump_table), r29 // .jump_table minus one entry: src alignment 1 selects COPYU(8)
- shl r28=r30, LOOP_SIZE // jmp_table thread
- mov ar.ec=2 // loop setup
- ;;
- add r29=r29,r28 // jmp_table thread
- cmp.eq p16,p17=r0,r0
- ;;
- mov b6=r29 // jmp_table thread
- ;;
- br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6) ld8 r37=[src1],8)
- add src0=8,src0
-(p6) shl r25=r30,3
- ;;
-EX(.ex_handler, (p6) ld8 r27=[src1])
-(p6) shr.u r28=r37,r25
-(p6) sub r26=64,r25
- ;;
-(p6) shl r27=r27,r26
- ;;
-(p6) or r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
- cmp.gt p8,p0=saved_in2,blocksize
- ;;
-(p8) add dst0=saved_in0,blocksize
-(p8) add src0=saved_in1,blocksize
-(p8) sub in2=saved_in2,blocksize
-(p8) br.dpnt .4k_block
- ;;
-
-/* we have up to 15 bytes to copy in the tail.
- * part of work is already done in the jump table code
- * we are at the following state.
- * src side:
- *
- * xxxxxx xx <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0 8 16
- * ^
- * |
- * src1
- *
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy
-(p6) add curlen=-8,curlen // update length
- mov ar.pfs=saved_pfs
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov in2=curlen // remaining length
- mov dst0=dst1 // dest pointer
- add src0=src1,r30 // forward by src alignment
- ;;
-
-// 7 byte or smaller.
-.memcpy_short:
- cmp.le p8,p9 = 1,in2
- cmp.le p10,p11 = 2,in2
- cmp.le p12,p13 = 3,in2
- cmp.le p14,p15 = 4,in2
- add src1=1,src0 // second src pointer
- add dst1=1,dst0 // second dest pointer
- ;;
-
-EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
-EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
-(p9) br.ret.dpnt rp // 0 byte copy
- ;;
-
-EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
-EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
-(p11) br.ret.dpnt rp // 1 byte copy
-
-EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
-EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
-(p13) br.ret.dpnt rp // 2 byte copy
- ;;
-
- cmp.le p6,p7 = 5,in2
- cmp.le p8,p9 = 6,in2
- cmp.le p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
-EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
-(p15) br.ret.dpnt rp // 3 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
-EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
-(p7) br.ret.dpnt rp // 4 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
-EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
-(p9) br.ret.dptk rp // 5 byte copy
-
-EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
-(p11) br.ret.dptk rp // 6 byte copy
- ;;
-
-EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
- br.ret.dptk rp // done all cases
-
-
-/* Align dest to nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to 8-byte boundary.
- * Actual number of byte to crawl depend on the dest alignment.
- * 7 byte or less is taken care at .memcpy_short
-
- * src0 - source even index
- * src1 - source odd index
- * dst0 - dest even index
- * dst1 - dest odd index
- * r30 - distance to 8-byte boundary
- */
-
-.align_dest:
- add src1=1,in1 // source odd index
- cmp.le p7,p0 = 2,r30 // for .align_dest
- cmp.le p8,p0 = 3,r30 // for .align_dest
-EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
- cmp.le p9,p0 = 4,r30 // for .align_dest
- cmp.le p10,p0 = 5,r30
- ;;
-EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
-EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
- cmp.le p11,p0 = 6,r30
-EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
- cmp.le p12,p0 = 7,r30
- ;;
-EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
-EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
-EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
-EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
- ;;
-EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
-EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
- cmp.eq p6,p7=r28,r29
-EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
-EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
- sub in2=in2,r30
- ;;
-EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
-EK(.ex_handler_short, (p12) st1 [dst0] = t7)
- add dst0=in0,r30 // setup arguments
- add src0=in1,r30
-(p6) br.cond.dptk .aligned_src
-(p7) br.cond.dpnt .unaligned_src
- ;;
-
-/* main loop body in jump table format */
-#define COPYU(shift) \
-1: \
-EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
-EK(.ex_handler, (p16) ld8 r36=[src1],8); \
- (p17) shrp r35=r33,r34,shift;; /* 1 */ \
-EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
- nop.m 0; \
- (p16) shrp r38=r36,r37,shift; \
-EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
-EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
- br.ctop.dptk.few 1b;; \
- (p7) add src1=-8,src1; /* back out for <8 byte case */ \
- shrp r21=r22,r38,shift; /* speculative work */ \
- br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
- ;;
- TEXT_ALIGN(32)
-.jump_table:
- COPYU(8) // unaligned cases
-.jmp1:
- COPYU(16)
- COPYU(24)
- COPYU(32)
- COPYU(40)
- COPYU(48)
- COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
- * instruction failed in the bundle. The exception algorithm is that we
- * first figure out the faulting address, then detect if there is any
- * progress made on the copy, if so, redo the copy from last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, remaining byte in kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A r19
-#define B r20
-#define C r21
-#define D r22
-#define F r28
-
-#define saved_retval loc0
-#define saved_rtlink loc1
-#define saved_pfs_stack loc2
-
-.ex_hndlr_s:
- add src0=8,src0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_d:
- add dst0=8,dst0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_lcpy_1:
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
- cmp.gtu p10,p11=src_pre_mem,saved_in1
- cmp.gtu p12,p13=dst_pre_mem,saved_in0
- ;;
-(p10) add src0=8,saved_in1
-(p11) mov src0=saved_in1
-(p12) add dst0=8,saved_in0
-(p13) mov dst0=saved_in0
- br.sptk .ex_handler
-.ex_handler_lcpy:
- // in line_copy block, the preload addresses should always be ahead
- // of the other two src/dst pointers. Furthermore, src1/dst1 should
- // always be ahead of src0/dst0.
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
-.ex_handler:
- mov pr=saved_pr,-1 // first restore pr, lc, and pfs
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- ;;
-.ex_handler_short: // faults in these sections occur while pr, lc, pfs are still unchanged
- cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
- cmp.ltu p10,p11=src0,src1
- cmp.ltu p12,p13=dst0,dst1
- fcmp.eq p8,p0=f6,f0 // is it memcpy?
- mov tmp = dst0
- ;;
-(p11) mov src1 = src0 // pick the larger of the two
-(p13) mov dst0 = dst1 // make dst0 the smaller one
-(p13) mov dst1 = tmp // and dst1 the larger one
- ;;
-(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
- ;;
-(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
-(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
- mov retval=saved_in2
-(p8) ld1 tmp=[src1] // force an oops for memcpy call
-(p8) st1 [dst1]=r0 // force an oops for memcpy call
-(p14) br.ret.sptk.many rp
-
-/*
- * The remaining byte to copy is calculated as:
- *
- * A = (faulting_addr - orig_src) -> len to faulting ld address
- * or
- * (faulting_addr - orig_dst) -> len to faulting st address
- * B = (cur_dst - orig_dst) -> len copied so far
- * C = A - B -> len need to be copied
- * D = orig_len - A -> len need to be left alone
- */
-(p6) sub A = F, saved_in0
-(p7) sub A = F, saved_in1
- clrrrb
- ;;
- alloc saved_pfs_stack=ar.pfs,3,3,3,0
- cmp.lt p8,p0=A,r0
- sub B = dst0, saved_in0 // how many byte copied so far
- ;;
-(p8) mov A = 0; // A shouldn't be negative, cap it
- ;;
- sub C = A, B
- sub D = saved_in2, A
- ;;
- cmp.gt p8,p0=C,r0 // at least 1 byte still to copy?
- mov r8=0
- mov saved_retval = D
- mov saved_rtlink = b0
-
- add out0=saved_in0, B
- add out1=saved_in1, B
- mov out2=C
-(p8) br.call.sptk.few b0=__copy_user // recursive call
- ;;
-
- add saved_retval=saved_retval,r8 // above might return non-zero value
- ;;
-
- mov retval=saved_retval
- mov ar.pfs=saved_pfs_stack
- mov b0=saved_rtlink
- br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 552c5c7e4d06..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
- Copyright (c) 2002 Hewlett-Packard Co/CERN
- Sverre Jarp <Sverre.Jarp@cern.ch>
-
- Return: dest
-
- Inputs:
- in0: dest
- in1: value
- in2: count
-
- The algorithm is fairly straightforward: set byte by byte until
- we get to a 16B-aligned address, then loop on 128B chunks using an
- early store as prefetching, then loop on 32B chunks, then clear remaining
- words, finally clear remaining bytes.
- Since a stf.spill f0 can store 16B in one go, we use this instruction
- to get peak speed when value = 0. */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#undef ret
-
-#define dest in0 // argument 0: destination address
-#define value in1 // argument 1: fill value (broadcast to 8 bytes by mux1 in memset)
-#define cnt in2 // argument 2: number of bytes still to be set
-
-#define tmp r31 // scratch
-#define save_lc r30 // copy of ar.lc taken on entry, written back before returning
-#define ptr0 r29 // second store pointer of the 128B loops
-#define ptr1 r28 // main store pointer
-#define ptr2 r27 // secondary store pointer
-#define ptr3 r26 // address of the last byte (short, unaligned path only)
-#define ptr9 r24 // pointer for the store-ahead ("prefetch") stores
-#define loopcnt r23 // value loaded into ar.lc
-#define linecnt r22 // cnt >> LSIZE_SH, i.e. number of full LINE_SIZE chunks
-#define bytecnt r21 // (MIN1+1) - (dest & MIN1): bytes up to the next 16B boundary
-
-#define fvalue f6 // fill pattern on the floating-point side (stored with stf8)
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr p6 // default register for same-cycle branches
-#define p_nz p7 // value != 0
-#define p_zr p8 // value == 0
-#define p_unalgn p9 // dest is not 16B-aligned
-#define p_y p11 // p_y/p_n: complementary "do it"/"skip it" pair
-#define p_n p12
-#define p_yy p13 // p_yy/p_nn: second complementary pair
-#define p_nn p14
-
-#define MIN1 15 // mask selecting the low 4 address bits (16B alignment)
-#define MIN1P1HALF 8 // (MIN1+1)/2
-#define LINE_SIZE 128 // bytes handled per iteration of the main loops
-#define LSIZE_SH 7 // shift amount
-#define PREF_AHEAD 8 // maximum number of lines stored ahead before the fill loop starts
-
-GLOBAL_ENTRY(memset) // memset(dest, value, cnt): fill cnt bytes at dest, return dest in ret0
-{ .mmi
- .prologue
- alloc tmp = ar.pfs, 3, 0, 0, 0
- lfetch.nt1 [dest] // prefetch the first destination line
- .save ar.lc, save_lc
- mov.i save_lc = ar.lc
- .body
-} { .mmi
- mov ret0 = dest // return value
- cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
- cmp.eq p_scr, p0 = cnt, r0 // cnt == 0: nothing to do
-;; }
-{ .mmi
- and ptr2 = -(MIN1+1), dest // aligned address
- and tmp = MIN1, dest // prepare to check for correct alignment
- tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
-} { .mib
- mov ptr1 = dest
- mux1 value = value, @brcst // create 8 identical bytes in word
-(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
-;; }
-{ .mib
- cmp.ne p_unalgn, p0 = tmp, r0 // is dest off a 16B boundary?
-} { .mib
- sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
- cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
-(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
-(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
-;; }
-{ .mib
-(p_y) add cnt = -8, cnt // account for the 8 bytes written by the st8
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
-} { .mib
-(p_y) st8 [ptr2] = value,-4 // fill the 8 bytes just below the 16B boundary, step down to the st4 slot
-(p_n) add ptr2 = 4, ptr2 // no st8: step up to the st4 slot instead
-;; }
-{ .mib
-(p_yy) add cnt = -4, cnt // account for the 4 bytes written by the st4
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
-} { .mib
-(p_yy) st4 [ptr2] = value,-2 // fill 4 bytes, step down to the st2 slot
-(p_nn) add ptr2 = 2, ptr2 // no st4: step up to the st2 slot instead
-;; }
-{ .mmi
- mov tmp = LINE_SIZE+1 // for compare
-(p_y) add cnt = -2, cnt // account for the 2 bytes written by the st2
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
-} { .mmi
- setf.sig fvalue=value // transfer value to FLP side
-(p_y) st2 [ptr2] = value,-1 // fill 2 bytes, step down to the st1 slot
-(p_n) add ptr2 = 1, ptr2 // no st2: step up to the st1 slot instead
-;; }
-
-{ .mmi
-(p_yy) st1 [ptr2] = value // fill the single remaining byte before the boundary
- cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
-} { .mbb
-(p_yy) add cnt = -1, cnt // account for the byte written by the st1
-(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
-;; }
-
-{ .mib
- nop.m 0
- shr.u linecnt = cnt, LSIZE_SH
-(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
-;; }
-
- TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: store ahead into all of them
- add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt // loop count for .pref_l1a
-;; }
-.pref_l1a:
-{ .mib
- stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp // loop count for .l1ax (one iteration per line)
-;; }
-.l1ax:
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 32
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf8 [ptr2] = fvalue, 24
-(p_scr) stf8 [ptr9] = fvalue, 128
- br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
- cmp.le p_scr, p0 = 8, cnt // at least 8 bytes left ?
-(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
- br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
-;; }
-
- TEXT_ALIGN(32)
-.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: store ahead into all of them
- add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt // loop count for .pref_l1b
-;; }
-.pref_l1b:
-{ .mib
- stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp // loop count for .l1bx (one iteration per line)
-;; }
-.l1bx:
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 64
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf.spill [ptr2] = f0, 32
-(p_scr) stf.spill [ptr9] = f0, 128
- br.cloop.dptk.few .l1bx
-;; }
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // fewer than 8 left: finish with st4/st2/st1
-;; }
-
-.fraction_of_line: // at most LINE_SIZE bytes left, starting at ptr1
-{ .mib
- add ptr2 = 16, ptr1
- shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
-;; }
-{ .mib
- cmp.eq p_scr, p0 = loopcnt, r0
- add loopcnt = -1, loopcnt
-(p_scr) br.cond.dpnt.many .store_words
-;; }
-{ .mib
- and cnt = 0x1f, cnt // compute the remaining cnt
- mov.i ar.lc = loopcnt
-;; }
- TEXT_ALIGN(32)
-.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
-{ .mmb
- stf8 [ptr1] = fvalue, 8
- stf8 [ptr2] = fvalue, 8
-;; } { .mmb
- stf8 [ptr1] = fvalue, 24
- stf8 [ptr2] = fvalue, 24
- br.cloop.dptk.many .l2
-;; }
-.store_words: // fewer than 32 bytes left: up to three 8-byte stores
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
-;; }
-
-{ .mmi
- stf8 [ptr1] = fvalue, 8 // store
- cmp.le p_y, p_n = 16, cnt
- add cnt = -8, cnt // subtract
-;; }
-{ .mmi
-(p_y) stf8 [ptr1] = fvalue, 8 // store
-(p_y) cmp.le.unc p_yy, p_nn = 16, cnt
-(p_y) add cnt = -8, cnt // subtract
-;; }
-{ .mmi // store
-(p_yy) stf8 [ptr1] = fvalue, 8
-(p_yy) add cnt = -8, cnt // subtract
-;; }
-
-.move_bytes_from_alignment: // fewer than 8 bytes left, starting at ptr1
-{ .mib
- cmp.eq p_scr, p0 = cnt, r0
- tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
-(p_scr) br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y) st4 [ptr1] = value,4
- tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy) st2 [ptr1] = value,2
- tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y) st1 [ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
- nop.m 0
- mov.i ar.lc = save_lc
- br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned: // cnt < 16: finish using st1/st2 stores only
-{ .mmi
- .pred.rel "mutex",p_y, p_n
- .pred.rel "mutex",p_yy, p_nn
-(p_n) cmp.le p_yy, p_nn = 4, cnt
-(p_y) cmp.le p_yy, p_nn = 5, cnt
-(p_n) add ptr2 = 2, ptr1
-} { .mmi
-(p_y) add ptr2 = 3, ptr1
-(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y) add cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy) cmp.le.unc p_y, p0 = 8, cnt
- add ptr3 = ptr1, cnt // prepare last store
- mov.i ar.lc = save_lc
-} { .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left]
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_y) cmp.le.unc p_yy, p0 = 8, cnt
- add ptr3 = -1, ptr3 // last store
- tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
-} { .mmi
-(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
- tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
-} { .mmi
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
-(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
- br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index 1f4a46c15127..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- * in0 address of string
- *
- * Outputs:
- * ret0 the number of characters in the string (0 if empty string)
- * does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. It includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads and
-// loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-// The goal is to look at the string in chunks of 8 bytes.
-// so we need to do a few extra checks at the beginning because the
-// string may not be 8-byte aligned. In this case we load the 8byte
-// quantity which includes the start of the string and mask the unused
-// bytes with 0xff to avoid confusing czx.
-// We use speculative loads and software pipelining to hide memory
-// latency and do read ahead safely. This way we defer any exception.
-//
-// Because we don't want the kernel to be relying on particular
-// settings of the DCR register, we provide recovery code in case
-// speculation fails. The recovery code is going to "redo" the work using
-// only normal loads. If we still get a fault then we generate a
-// kernel panic. Otherwise we return the strlen as usual.
-//
-// The fact that speculation may fail can be caused, for instance, by
-// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-// a NaT bit will be set if the translation is not present. The normal
-// load, on the other hand, will cause the translation to be inserted
-// if the mapping exists.
-//
-// It should be noted that we execute recovery code only when we need
-// to use the data that has been speculatively loaded: we don't execute
-// recovery code on pure read ahead data.
-//
-// Remarks:
-// - the cmp r0,r0 is used as a fast way to initialize a predicate
-// register to 1. This is required to make sure that we get the parallel
-// compare correct.
-//
-// - we don't use the epilogue counter to exit the loop but we need to set
-// it to zero beforehand.
-//
-// - after the loop we must test for Nat values because neither the
-// czx nor cmp instruction raise a NaT consumption fault. We must be
-// careful not to look too far for a Nat for which we don't care.
-// For instance we don't need to look at a NaT in val2 if the zero byte
-// was in val1.
-//
-// - Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs r11 // ar.pfs as returned by alloc; written back before returning
-#define tmp r10 // scratch
-#define base r16 // 8-byte-aligned start address, re-read by the recovery loop
-#define orig r17 // original (possibly unaligned) string address
-#define saved_pr r18 // predicate registers saved on entry
-#define src r19 // running load pointer (post-incremented by 8)
-#define mask r20 // all-ones in the bytes of the first word that precede the string
-#define val r21 // word loaded by the recovery loop
-#define val1 r22 // czx1.r result for the first word of a pair (8 = no zero byte)
-#define val2 r23 // czx1.r result for the following word (8 = no zero byte)
-
-GLOBAL_ENTRY(strlen) // in0 = string address; ret0 = length, not counting the NUL
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
- .rotr v[2], w[2] // declares our 4 aliases
-
- extr.u tmp=in0,0,3 // tmp=least significant 3 bits
- mov orig=in0 // keep track of initial byte address
- dep src=0,in0,0,3 // src=8byte-aligned in0 address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
- ;;
-
- .body
-
- ld8 v[1]=[src],8 // must not speculate: can fail here
- shl tmp=tmp,3 // multiply by 8bits/byte
- mov mask=-1 // our mask
- ;;
- ld8.s w[1]=[src],8 // speculatively load next
- cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
- sub tmp=64,tmp // how many bits to shift our mask on the right
- ;;
- shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
- mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
- ;;
- add base=-16,src // keep track of aligned base
- or v[1]=v[1],mask // now we have a safe initial byte pattern
- ;;
-1:
- ld8.s v[0]=[src],8 // speculatively load next
- czx1.r val1=v[1] // search 0 byte from right
- czx1.r val2=w[1] // search 0 byte from right following 8bytes
- ;;
- ld8.s w[0]=[src],8 // speculatively load next to next
- cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
- cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8
-(p6) br.wtop.dptk 1b // loop until p6 == 0
- ;;
- //
- // We must try the recovery code iff
- // val1_is_nat || (val1==8 && val2_is_nat)
- //
- // XXX Fixme
- // - there must be a better way of doing the test
- //
- cmp.eq p8,p9=8,val1 // p8 = no zero byte in val1, p9 = val1 had zero (disambiguate)
- tnat.nz p6,p7=val1 // test NaT on val1
-(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
- ;;
- //
- // if we come here p7 is true, i.e., initialized for the parallel (and-type) compares
- //
- cmp.eq.and p7,p0=8,val1// val1==8?
- tnat.nz.and p7,p0=val2 // test NaT of val2
-(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
- ;;
-(p8) mov val1=val2 // the other test got us out of the loop
-(p8) adds src=-16,src // correct position when 3 ahead
-(p9) adds src=-24,src // correct position when 4 ahead
- ;;
- sub ret0=src,orig // distance from base
- sub tmp=8,val1 // which byte in word
- mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63: restore the rotating predicates
- ;;
- sub ret0=ret0,tmp // adjust
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of normal execution
-
- //
- // Outlined recovery code when speculation failed
- //
- // This time we don't use speculation and rely on the normal exception
- // mechanism. That's why the loop is not as good as the previous one
- // because read ahead is not possible
- //
- // IMPORTANT:
- // Please note that in the case of strlen() as opposed to strlen_user()
- // we don't use the exception mechanism, as this function is not
- // supposed to fail. If that happens it means we have a bug and the
- // code will cause a kernel fault.
- //
- // XXX Fixme
- // - today we restart from the beginning of the string instead
- // of trying to continue where we left off.
- //
-.recover:
- ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- or val=val,mask // remask first bytes
- cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
- ;;
- //
- // ar.ec is still zero here
- //
-2:
-(p6) ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- czx1.r val1=val // search 0 byte from right
- ;;
- cmp.eq p6,p0=8,val1 // val1==8 ?
-(p6) br.wtop.dptk 2b // loop until p6 == 0
- ;; // (avoid WAW on p63)
- sub ret0=base,orig // distance from base
- sub tmp=8,val1
- mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63: restore the rotating predicates
- ;;
- sub ret0=ret0,tmp // length=now - back -1
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index a287169bd953..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- * in0: address of destination buffer
- * in1: address of string to be copied
- * in2: length of buffer in bytes
- * Outputs:
- * r8: -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found
- * by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user) // in0 = dst buffer, in1 = src string, in2 = buffer length; result in r8
- alloc r2=ar.pfs,3,0,0,0
- mov r8=0 // return value if the buffer length is 0
- mov r9=in1 // r9 = start of the source string (for the length computation)
- ;;
- add r10=in1,in2 // r10 = source address reached once in2 bytes have been copied
- cmp.eq p6,p0=r0,in2 // zero-length buffer?
-(p6) br.ret.spnt.many rp
-
- // XXX braindead copy loop---this needs to be optimized
-.Loop1:
- EX(.Lexit, ld1 r8=[in1],1) // load next byte; EX (from asm/asmmacro.h, not shown here) presumably routes a fault to .Lexit
- ;;
- EX(.Lexit, st1 [in0]=r8,1) // store it, including the terminating NUL
- cmp.ne p6,p7=r8,r0 // p6 = byte was not NUL, p7 = NUL copied
- ;;
-(p6) cmp.ne.unc p8,p0=in1,r10 // not NUL: is there still room in the buffer?
-(p8) br.cond.dpnt.few .Loop1
- ;;
-(p6) mov r8=in2 // buffer filled up---return buffer length
-(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
-[.Lexit:]
- br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index a7eb56e840a9..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- * in0: address of buffer
- * in1: string length limit N
- * Outputs:
- * r8: 0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user) // in0 = buffer address, in1 = limit N; result in r8
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- .save ar.lc, r16
- mov r16=ar.lc // preserve ar.lc
-
- .body
-
- add r3=-1,in1 // loop counter = N-1, so at most N bytes are examined
- ;;
- mov ar.lc=r3
- mov r9=0 // r9 = number of bytes examined so far
- ;;
- // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
- EXCLR(.Lexit, ld1 r8=[in0],1) // load next byte; EXCLR (from asm/asmmacro.h, not shown here) presumably routes a fault to .Lexit
- add r9=1,r9 // count this byte, so finding the NUL yields strlen+1
- ;;
- cmp.eq p6,p0=r8,r0 // NUL found?
-(p6) br.cond.dpnt .Lexit
- br.cloop.dptk.few .Loop1
-
- add r9=1,in1 // NUL not found---return N+1
- ;;
-.Lexit:
- mov r8=r9 // return the count accumulated in r9
- mov ar.lc=r16 // restore ar.lc
- br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 6e2a69662c06..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2) // in0 = byte count, in1 = buffer updated in place, in2 = second source
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 3, 0, 13, 16
- .save ar.lc, r30
- mov r30 = ar.lc // saved here, written back before returning
- .save pr, r29
- mov r29 = pr // saved here, written back before returning
- ;;
- .body
- mov r8 = in1 // r8 = store pointer: results overwrite the in1 buffer
- mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages (see .rotp below)
- shr in0 = in0, 3 // byte count -> number of 8-byte words
- ;;
- adds in0 = -1, in0 // ar.lc is loaded with word count - 1
- mov r16 = in1 // r16 = load pointer for the in1 buffer
- mov r17 = in2 // r17 = load pointer for the in2 buffer
- ;;
- mov ar.lc = in0
- mov pr.rot = 1 << 16 // only p16 (stage 0) starts out set
- ;;
- .rotr s1[6+1], s2[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8 // stage 0: load one word from each buffer
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6] // stage 6: XOR the words loaded six rotations earlier
-(p[6+1])st8.nta [r8] = d[1], 8 // stage 7: store the result of the previous rotation
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30 // restore ar.lc
- mov pr = r29, -1 // restore all predicates
- br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3) // in0 = byte count, in1 = buffer updated in place, in2-in3 = further sources
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 4, 0, 20, 24
- .save ar.lc, r30
- mov r30 = ar.lc // saved here, written back before returning
- .save pr, r29
- mov r29 = pr // saved here, written back before returning
- ;;
- .body
- mov r8 = in1 // r8 = store pointer: results overwrite the in1 buffer
- mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages (see .rotp below)
- shr in0 = in0, 3 // byte count -> number of 8-byte words
- ;;
- adds in0 = -1, in0 // ar.lc is loaded with word count - 1
- mov r16 = in1 // r16-r18 = load pointers for in1-in3
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16 // only p16 (stage 0) starts out set
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8 // stage 0: load one word from each buffer
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6] // stage 6: start the XOR of the words loaded six rotations earlier
- ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8 // stage 7: store the result of the previous rotation
-(p[6]) xor d[0] = d[0], s3[6] // fold in the third source
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30 // restore ar.lc
- mov pr = r29, -1 // restore all predicates
- br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4) // in0 = byte count, in1 = buffer updated in place, in2-in4 = further sources
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 5, 0, 27, 32
- .save ar.lc, r30
- mov r30 = ar.lc // saved here, written back before returning
- .save pr, r29
- mov r29 = pr // saved here, written back before returning
- ;;
- .body
- mov r8 = in1 // r8 = store pointer: results overwrite the in1 buffer
- mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages (see .rotp below)
- shr in0 = in0, 3 // byte count -> number of 8-byte words
- ;;
- adds in0 = -1, in0 // ar.lc is loaded with word count - 1
- mov r16 = in1 // r16-r19 = load pointers for in1-in4
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16 // only p16 (stage 0) starts out set
- mov r19 = in4
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8 // stage 0: load one word from each buffer
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6] // stage 6: XOR sources 1 and 2 ...
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6] // ... and sources 3 and 4 (r20 is a non-rotating temporary)
- ;;
-(p[6+1])st8.nta [r8] = d[1], 8 // stage 7: store the result of the previous rotation
-(p[6]) xor d[0] = d[0], r20 // combine the two partial results
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30 // restore ar.lc
- mov pr = r29, -1 // restore all predicates
- br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5) // in0 = byte count, in1 = buffer updated in place, in2-in5 = further sources
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 6, 0, 34, 40
- .save ar.lc, r30
- mov r30 = ar.lc // saved here, written back before returning
- .save pr, r29
- mov r29 = pr // saved here, written back before returning
- ;;
- .body
- mov r8 = in1 // r8 = store pointer: results overwrite the in1 buffer
- mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages (see .rotp below)
- shr in0 = in0, 3 // byte count -> number of 8-byte words
- ;;
- adds in0 = -1, in0 // ar.lc is loaded with word count - 1
- mov r16 = in1 // r16-r20 = load pointers for in1-in5
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16 // only p16 (stage 0) starts out set
- mov r19 = in4
- mov r20 = in5
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8 // stage 0: load one word from each buffer
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6] // stage 6: XOR sources 1 and 2 ...
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6] // ... and sources 3 and 4 (r21 is a non-rotating temporary)
- ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8 // stage 7: store the result of the previous rotation
-(p[6]) xor d[0] = d[0], r21 // combine the two partial results
- ;;
-(p[6]) xor d[0] = d[0], s5[6] // fold in the fifth source
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30 // restore ar.lc
- mov pr = r29, -1 // restore all predicates
- br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)