Diffstat (limited to 'arch/tile/lib')
29 files changed, 0 insertions, 3313 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile deleted file mode 100644 index 815a1fdeb2e4..000000000000 --- a/arch/tile/lib/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for TILE-specific library files.. -# - -lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ - memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ - strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o - -lib-$(CONFIG_TILEGX) += memcpy_user_64.o -lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o -lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o - -obj-$(CONFIG_MODULES) += exports.o - -# The finv_buffer_remote() and copy_{to,from}_user() routines can't -# have -pg added, since they both rely on being leaf functions. -CFLAGS_REMOVE_cacheflush.o = -pg -CFLAGS_REMOVE_memcpy_user_64.o = -pg diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c deleted file mode 100644 index f8128800dbf5..000000000000 --- a/arch/tile/lib/atomic_32.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/cache.h> -#include <linux/delay.h> -#include <linux/uaccess.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/atomic.h> -#include <arch/chip.h> - -/* This page is remapped on startup to be hash-for-home. */ -int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; - -int *__atomic_hashed_lock(volatile void *v) -{ - /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */ - /* - * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index. - * Using mm works here because atomic_locks is page aligned. - */ - unsigned long ptr = __insn_mm((unsigned long)v >> 1, - (unsigned long)atomic_locks, - 2, (ATOMIC_HASH_SHIFT + 2) - 1); - return (int *)ptr; -} - -#ifdef CONFIG_SMP -/* Return whether the passed pointer is a valid atomic lock pointer. */ -static int is_atomic_lock(int *p) -{ - return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; -} - -void __atomic_fault_unlock(int *irqlock_word) -{ - BUG_ON(!is_atomic_lock(irqlock_word)); - BUG_ON(*irqlock_word != 1); - *irqlock_word = 0; -} - -#endif /* CONFIG_SMP */ - -static inline int *__atomic_setup(volatile void *v) -{ - /* Issue a load to the target to bring it into cache. */ - *(volatile int *)v; - return __atomic_hashed_lock(v); -} - -int _atomic_xchg(int *v, int n) -{ - return __atomic32_xchg(v, __atomic_setup(v), n).val; -} -EXPORT_SYMBOL(_atomic_xchg); - -int _atomic_xchg_add(int *v, int i) -{ - return __atomic32_xchg_add(v, __atomic_setup(v), i).val; -} -EXPORT_SYMBOL(_atomic_xchg_add); - -int _atomic_xchg_add_unless(int *v, int a, int u) -{ - /* - * Note: argument order is switched here since it is easier - * to use the first argument consistently as the "old value" - * in the assembly, as is done for _atomic_cmpxchg(). 
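For context on the scheme being removed: TILEPro has no 32-bit compare-and-exchange instruction, so each atomic word is guarded by one of a page-sized array of spinlock words, selected by hashing the operand's address. That is what __atomic_hashed_lock() computes above with the mm (masked-merge) instruction. A minimal portable sketch of the index computation, assuming only what the comment states (bits [3, 3 + ATOMIC_HASH_SHIFT) select the lock); the ATOMIC_HASH_SHIFT value here is illustrative:

#include <stdint.h>

#define ATOMIC_HASH_SHIFT 9                 /* illustrative value only */
#define ATOMIC_HASH_SIZE  (1 << ATOMIC_HASH_SHIFT)

static int atomic_locks[ATOMIC_HASH_SIZE];  /* page-aligned in the real code */

static int *hashed_lock(volatile void *v)
{
        /* Bits [3, 3 + ATOMIC_HASH_SHIFT) of the address pick the lock,
         * so nearby words usually hash to different lock words. */
        uintptr_t idx = ((uintptr_t)v >> 3) & (ATOMIC_HASH_SIZE - 1);
        return &atomic_locks[idx];
}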
- */ - return __atomic32_xchg_add_unless(v, __atomic_setup(v), u, a).val; -} -EXPORT_SYMBOL(_atomic_xchg_add_unless); - -int _atomic_cmpxchg(int *v, int o, int n) -{ - return __atomic32_cmpxchg(v, __atomic_setup(v), o, n).val; -} -EXPORT_SYMBOL(_atomic_cmpxchg); - -unsigned long _atomic_fetch_or(volatile unsigned long *p, unsigned long mask) -{ - return __atomic32_fetch_or((int *)p, __atomic_setup(p), mask).val; -} -EXPORT_SYMBOL(_atomic_fetch_or); - -unsigned long _atomic_fetch_and(volatile unsigned long *p, unsigned long mask) -{ - return __atomic32_fetch_and((int *)p, __atomic_setup(p), mask).val; -} -EXPORT_SYMBOL(_atomic_fetch_and); - -unsigned long _atomic_fetch_andn(volatile unsigned long *p, unsigned long mask) -{ - return __atomic32_fetch_andn((int *)p, __atomic_setup(p), mask).val; -} -EXPORT_SYMBOL(_atomic_fetch_andn); - -unsigned long _atomic_fetch_xor(volatile unsigned long *p, unsigned long mask) -{ - return __atomic32_fetch_xor((int *)p, __atomic_setup(p), mask).val; -} -EXPORT_SYMBOL(_atomic_fetch_xor); - - -long long _atomic64_xchg(long long *v, long long n) -{ - return __atomic64_xchg(v, __atomic_setup(v), n); -} -EXPORT_SYMBOL(_atomic64_xchg); - -long long _atomic64_xchg_add(long long *v, long long i) -{ - return __atomic64_xchg_add(v, __atomic_setup(v), i); -} -EXPORT_SYMBOL(_atomic64_xchg_add); - -long long _atomic64_xchg_add_unless(long long *v, long long a, long long u) -{ - /* - * Note: argument order is switched here since it is easier - * to use the first argument consistently as the "old value" - * in the assembly, as is done for _atomic_cmpxchg(). - */ - return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a); -} -EXPORT_SYMBOL(_atomic64_xchg_add_unless); - -long long _atomic64_cmpxchg(long long *v, long long o, long long n) -{ - return __atomic64_cmpxchg(v, __atomic_setup(v), o, n); -} -EXPORT_SYMBOL(_atomic64_cmpxchg); - -long long _atomic64_fetch_and(long long *v, long long n) -{ - return __atomic64_fetch_and(v, __atomic_setup(v), n); -} -EXPORT_SYMBOL(_atomic64_fetch_and); - -long long _atomic64_fetch_or(long long *v, long long n) -{ - return __atomic64_fetch_or(v, __atomic_setup(v), n); -} -EXPORT_SYMBOL(_atomic64_fetch_or); - -long long _atomic64_fetch_xor(long long *v, long long n) -{ - return __atomic64_fetch_xor(v, __atomic_setup(v), n); -} -EXPORT_SYMBOL(_atomic64_fetch_xor); - -/* - * If any of the atomic or futex routines hit a bad address (not in - * the page tables at kernel PL) this routine is called. The futex - * routines are never used on kernel space, and the normal atomics and - * bitops are never used on user space. So a fault on kernel space - * must be fatal, but a fault on userspace is a futex fault and we - * need to return -EFAULT. Note that the context this routine is - * invoked in is the context of the "_atomic_xxx()" routines called - * by the functions in this file. - */ -struct __get_user __atomic_bad_address(int __user *addr) -{ - if (unlikely(!access_ok(VERIFY_WRITE, addr, sizeof(int)))) - panic("Bad address used for kernel atomic op: %p\n", addr); - return (struct __get_user) { .err = -EFAULT }; -} - - -void __init __init_atomic_per_cpu(void) -{ - /* Validate power-of-two and "bigger than cpus" assumption */ - BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); - BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); - - /* - * On TILEPro we prefer to use a single hash-for-home - * page, since this means atomic operations are less - * likely to encounter a TLB fault and thus should - * in general perform faster. 
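Every _atomic_*() wrapper above follows the same pattern: prime the cache with a plain load, look up the hashed lock, then run the read-modify-write under that lock in assembly. A hedged C11 rendering of what the __atomic32_cmpxchg helper does (the real one lives in atomic_asm_32.S and additionally handles futex faults):

#include <stdatomic.h>

static int lockbased_cmpxchg(int *v, atomic_flag *lock, int old, int new)
{
        int prev;

        while (atomic_flag_test_and_set(lock))
                ;                      /* stands in for the 'tns' retry loop */
        prev = *v;
        if (prev == old)               /* store only on a match */
                *v = new;
        atomic_flag_clear(lock);
        return prev;
}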
You may wish to disable - * this in situations where few hash-for-home tiles - * are configured. - */ - BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0); - - /* The locks must all fit on one page. */ - BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE); - - /* - * We use the page offset of the atomic value's address as - * an index into atomic_locks, excluding the low 3 bits. - * That should not produce more indices than ATOMIC_HASH_SIZE. - */ - BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); -} diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S deleted file mode 100644 index 94709ab41ed8..000000000000 --- a/arch/tile/lib/atomic_asm_32.S +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Support routines for atomic operations. Each function takes: - * - * r0: address to manipulate - * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG) - * r2: new value to write, or for cmpxchg/add_unless, value to compare against - * r3: (cmpxchg/xchg_add_unless) new value to write or add; - * (atomic64 ops) high word of value to write - * r4/r5: (cmpxchg64/add_unless64) new value to write or add - * - * The 32-bit routines return a "struct __get_user" so that the futex code - * has an opportunity to return -EFAULT to the user if needed. - * The 64-bit routines just return a "long long" with the value, - * since they are only used from kernel space and don't expect to fault. - * Support for 16-bit ops is included in the framework but we don't provide any. - * - * Note that the caller is advised to issue a suitable L1 or L2 - * prefetch on the address being manipulated to avoid extra stalls. - * In addition, the hot path is on two icache lines, and we start with - * a jump to the second line to make sure they are both in cache so - * that we never stall waiting on icache fill while holding the lock. - * (This doesn't work out with most 64-bit ops, since they consume - * too many bundles, so may take an extra i-cache stall.) - * - * These routines set the INTERRUPT_CRITICAL_SECTION bit, just - * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt - * the code, just page faults. - * - * If the load or store faults in a way that can be directly fixed in - * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it - * directly, return to the instruction that faulted, and retry it. - * - * If the load or store faults in a way that potentially requires us - * to release the atomic lock, then retry (e.g. a migrating PTE), we - * reset the PC in do_page_fault_ics() to the "tns" instruction so - * that on return we will reacquire the lock and restart the op. We - * are somewhat overloading the exception_table_entry notion by doing - * this, since those entries are not normally used for migrating PTEs. 
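The contended lock-acquire path in the macro below implements bounded exponential backoff: a failed 'tns' (test-and-set) waits 32 cycles before retrying, the wait doubles after each failure, and it is capped at 2048 cycles. A C sketch of the same policy; test_and_set() and cycle_count() are hypothetical stand-ins for 'tns' and the CYCLE_LOW SPR:

extern int test_and_set(volatile int *lock);   /* hypothetical 'tns' */
extern unsigned long cycle_count(void);        /* hypothetical CYCLE_LOW */

static void lock_with_backoff(volatile int *lock)
{
        unsigned long backoff = 32;            /* starting backoff, cycles */

        while (test_and_set(lock)) {
                unsigned long start = cycle_count();

                while (cycle_count() - start < backoff)
                        ;                      /* spin out the backoff */
                backoff <<= 1;                 /* double it each retry... */
                if (backoff > 2048)
                        backoff = 2048;        /* ...capped at 2048 cycles */
        }
}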
- * - * If the main page fault handler discovers a bad address, it will see - * the PC pointing to the "tns" instruction (due to the earlier - * exception_table_entry processing in do_page_fault_ics), and - * re-reset the PC to the fault handler, atomic_bad_address(), which - * effectively takes over from the atomic op and can either return a - * bad "struct __get_user" (for user addresses) or can just panic (for - * bad kernel addresses). - * - * Note that if the value we would store is the same as what we - * loaded, we bypass the store. Other platforms with true atomics can - * make the guarantee that a non-atomic __clear_bit(), for example, - * can safely race with an atomic test_and_set_bit(); this example is - * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do - * that on Tile since the "atomic" op is really just a - * read/modify/write, and can race with the non-atomic - * read/modify/write. However, if we can short-circuit the write when - * it is not needed, in the atomic case, we avoid the race. - */ - -#include <linux/linkage.h> -#include <asm/atomic_32.h> -#include <asm/page.h> -#include <asm/processor.h> - - .section .text.atomic,"ax" -ENTRY(__start_atomic_asm_code) - - .macro atomic_op, name, bitwidth, body - .align 64 -STD_ENTRY_SECTION(__atomic\name, .text.atomic) - { - movei r24, 1 - j 4f /* branch to second cache line */ - } -1: { - .ifc \bitwidth,16 - lh r22, r0 - .else - lw r22, r0 - addi r28, r0, 4 - .endif - } - .ifc \bitwidth,64 - lw r23, r28 - .endif - \body /* set r24, and r25 if 64-bit */ - { - seq r26, r22, r24 - seq r27, r23, r25 - } - .ifc \bitwidth,64 - bbnst r27, 2f - .endif - bbs r26, 3f /* skip write-back if it's the same value */ -2: { - .ifc \bitwidth,16 - sh r0, r24 - .else - sw r0, r24 - .endif - } - .ifc \bitwidth,64 - sw r28, r25 - .endif - mf -3: { - move r0, r22 - .ifc \bitwidth,64 - move r1, r23 - .else - move r1, zero - .endif - sw ATOMIC_LOCK_REG_NAME, zero - } - mtspr INTERRUPT_CRITICAL_SECTION, zero - jrp lr -4: { - move ATOMIC_LOCK_REG_NAME, r1 - mtspr INTERRUPT_CRITICAL_SECTION, r24 - } -#ifndef CONFIG_SMP - j 1b /* no atomic locks */ -#else - { - tns r21, ATOMIC_LOCK_REG_NAME - moveli r23, 2048 /* maximum backoff time in cycles */ - } - { - bzt r21, 1b /* branch if lock acquired */ - moveli r25, 32 /* starting backoff time in cycles */ - } -5: mtspr INTERRUPT_CRITICAL_SECTION, zero - mfspr r26, CYCLE_LOW /* get start point for this backoff */ -6: mfspr r22, CYCLE_LOW /* test to see if we've backed off enough */ - sub r22, r22, r26 - slt r22, r22, r25 - bbst r22, 6b - { - mtspr INTERRUPT_CRITICAL_SECTION, r24 - shli r25, r25, 1 /* double the backoff; retry the tns */ - } - { - tns r21, ATOMIC_LOCK_REG_NAME - slt r26, r23, r25 /* is the proposed backoff too big? */ - } - { - bzt r21, 1b /* branch if lock acquired */ - mvnz r25, r26, r23 - } - j 5b -#endif - STD_ENDPROC(__atomic\name) - .ifc \bitwidth,32 - .pushsection __ex_table,"a" - .align 4 - .word 1b, __atomic\name - .word 2b, __atomic\name - .word __atomic\name, __atomic_bad_address - .popsection - .endif - .endm - - -/* - * Use __atomic32 prefix to avoid collisions with GCC builtin __atomic functions. 
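One gloss on the instantiations just below: Tile has no and-not instruction, so 32_fetch_andn synthesizes it, using "nor r2, r2, zero" as bitwise NOT. What it computes under the lock, in C terms:

/* fetch_andn: return the old value, clear the mask bits. */
static int fetch_andn(int *v, int mask)
{
        int old = *v;          /* r22: the loaded old value */
        *v = old & ~mask;      /* nor r2, r2, zero ; and r24, r22, r2 */
        return old;
}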
- */ - -atomic_op 32_cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }" -atomic_op 32_xchg, 32, "move r24, r2" -atomic_op 32_xchg_add, 32, "add r24, r22, r2" -atomic_op 32_xchg_add_unless, 32, \ - "sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }" -atomic_op 32_fetch_or, 32, "or r24, r22, r2" -atomic_op 32_fetch_and, 32, "and r24, r22, r2" -atomic_op 32_fetch_andn, 32, "nor r2, r2, zero; and r24, r22, r2" -atomic_op 32_fetch_xor, 32, "xor r24, r22, r2" - -atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \ - { bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }" -atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }" -atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \ - slt_u r26, r24, r22; add r25, r25, r26" -atomic_op 64_xchg_add_unless, 64, \ - "{ sne r26, r22, r2; sne r27, r23, r3 }; \ - { bbns r26, 3f; add r24, r22, r4 }; \ - { bbns r27, 3f; add r25, r23, r5 }; \ - slt_u r26, r24, r22; add r25, r25, r26" -atomic_op 64_fetch_or, 64, "{ or r24, r22, r2; or r25, r23, r3 }" -atomic_op 64_fetch_and, 64, "{ and r24, r22, r2; and r25, r23, r3 }" -atomic_op 64_fetch_xor, 64, "{ xor r24, r22, r2; xor r25, r23, r3 }" - - jrp lr /* happy backtracer */ - -ENTRY(__end_atomic_asm_code) diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c deleted file mode 100644 index c1ebc1065fc1..000000000000 --- a/arch/tile/lib/cacheflush.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/export.h> -#include <asm/page.h> -#include <asm/cacheflush.h> -#include <arch/icache.h> -#include <arch/spr_def.h> - - -void __flush_icache_range(unsigned long start, unsigned long end) -{ - invalidate_icache((const void *)start, end - start, PAGE_SIZE); -} - - -/* Force a load instruction to issue. */ -static inline void force_load(char *p) -{ - *(volatile char *)p; -} - -/* - * Flush and invalidate a VA range that is homed remotely on a single - * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting - * until the memory controller holds the flushed values. - */ -void __attribute__((optimize("omit-frame-pointer"))) -finv_buffer_remote(void *buffer, size_t size, int hfh) -{ - char *p, *base; - size_t step_size, load_count; - - /* - * On TILEPro the striping granularity is a fixed 8KB; on - * TILE-Gx it is configurable, and we rely on the fact that - * the hypervisor always configures maximum striping, so that - * bits 9 and 10 of the PA are part of the stripe function, so - * every 512 bytes we hit a striping boundary. - * - */ -#ifdef __tilegx__ - const unsigned long STRIPE_WIDTH = 512; -#else - const unsigned long STRIPE_WIDTH = 8192; -#endif - -#ifdef __tilegx__ - /* - * On TILE-Gx, we must disable the dstream prefetcher before doing - * a cache flush; otherwise, we could end up with data in the cache - * that we don't want there. 
Note that normally we'd do an mf - * after the SPR write to disabling the prefetcher, but we do one - * below, before any further loads, so there's no need to do it - * here. - */ - uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); - __insn_mtspr(SPR_DSTREAM_PF, 0); -#endif - - /* - * Flush and invalidate the buffer out of the local L1/L2 - * and request the home cache to flush and invalidate as well. - */ - __finv_buffer(buffer, size); - - /* - * Wait for the home cache to acknowledge that it has processed - * all the flush-and-invalidate requests. This does not mean - * that the flushed data has reached the memory controller yet, - * but it does mean the home cache is processing the flushes. - */ - __insn_mf(); - - /* - * Issue a load to the last cache line, which can't complete - * until all the previously-issued flushes to the same memory - * controller have also completed. If we weren't striping - * memory, that one load would be sufficient, but since we may - * be, we also need to back up to the last load issued to - * another memory controller, which would be the point where - * we crossed a "striping" boundary (the granularity of striping - * across memory controllers). Keep backing up and doing this - * until we are before the beginning of the buffer, or have - * hit all the controllers. - * - * If we are flushing a hash-for-home buffer, it's even worse. - * Each line may be homed on a different tile, and each tile - * may have up to four lines that are on different - * controllers. So as we walk backwards, we have to touch - * enough cache lines to satisfy these constraints. In - * practice this ends up being close enough to "load from - * every cache line on a full memory stripe on each - * controller" that we simply do that, to simplify the logic. - * - * On TILE-Gx the hash-for-home function is much more complex, - * with the upshot being we can't readily guarantee we have - * hit both entries in the 128-entry AMT that were hit by any - * load in the entire range, so we just re-load them all. - * With larger buffers, we may want to consider using a hypervisor - * trap to issue loads directly to each hash-for-home tile for - * each controller (doing it from Linux would trash the TLB). - */ - if (hfh) { - step_size = L2_CACHE_BYTES; -#ifdef __tilegx__ - load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES; -#else - load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) * - (1 << CHIP_LOG_NUM_MSHIMS()); -#endif - } else { - step_size = STRIPE_WIDTH; - load_count = (1 << CHIP_LOG_NUM_MSHIMS()); - } - - /* Load the last byte of the buffer. */ - p = (char *)buffer + size - 1; - force_load(p); - - /* Bump down to the end of the previous stripe or cache line. */ - p -= step_size; - p = (char *)((unsigned long)p | (step_size - 1)); - - /* Figure out how far back we need to go. */ - base = p - (step_size * (load_count - 2)); - if ((unsigned long)base < (unsigned long)buffer) - base = buffer; - - /* Fire all the loads we need. */ - for (; p >= base; p -= step_size) - force_load(p); - - /* - * Repeat, but with finv's instead of loads, to get rid of the - * data we just loaded into our own cache and the old home L3. - * The finv's are guaranteed not to actually flush the data in - * the buffer back to their home, since we just read it, so the - * lines are clean in cache; we will only invalidate those lines. 
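To make the load_count arithmetic above concrete, take the 64-byte L2 line, the fixed 8KB TILEPro stripe, and a hypothetical part with four memory controllers (CHIP_LOG_NUM_MSHIMS() == 2 is an assumed value for illustration):

unsigned long step_size, load_count;

/* !hfh: one load per stripe, one stripe per controller. */
step_size  = 8192;               /* STRIPE_WIDTH on TILEPro */
load_count = 1 << 2;             /* = 4 loads */

/* hfh on TILEPro: every line of a full stripe, on each controller. */
step_size  = 64;                 /* L2_CACHE_BYTES */
load_count = (8192 / 64) << 2;   /* = 512 loads */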
- */ - p = (char *)buffer + size - 1; - __insn_finv(p); - p -= step_size; - p = (char *)((unsigned long)p | (step_size - 1)); - for (; p >= base; p -= step_size) - __insn_finv(p); - - /* Wait for these finv's (and thus the first finvs) to be done. */ - __insn_mf(); - -#ifdef __tilegx__ - /* Reenable the prefetcher. */ - __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf); -#endif -} -EXPORT_SYMBOL_GPL(finv_buffer_remote); diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c deleted file mode 100644 index c3ca3e64d9d9..000000000000 --- a/arch/tile/lib/checksum.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * Support code for the main lib/checksum.c. - */ - -#include <net/checksum.h> -#include <linux/module.h> - -__wsum do_csum(const unsigned char *buff, int len) -{ - int odd, count; - unsigned long result = 0; - - if (len <= 0) - goto out; - odd = 1 & (unsigned long) buff; - if (odd) { - result = (*buff << 8); - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(const unsigned short *)buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { -#ifdef __tilegx__ - if (4 & (unsigned long) buff) { - unsigned int w = *(const unsigned int *)buff; - result = __insn_v2sadau(result, w, 0); - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ -#endif - - /* - * This algorithm could wrap around for very - * large buffers, but those should be impossible. - */ - BUG_ON(count >= 65530); - - while (count) { - unsigned long w = *(const unsigned long *)buff; - count--; - buff += sizeof(w); -#ifdef __tilegx__ - result = __insn_v2sadau(result, w, 0); -#else - result = __insn_sadah_u(result, w, 0); -#endif - } -#ifdef __tilegx__ - if (len & 4) { - unsigned int w = *(const unsigned int *)buff; - result = __insn_v2sadau(result, w, 0); - buff += 4; - } -#endif - } - if (len & 2) { - result += *(const unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - result = csum_long(result); - if (odd) - result = swab16(result); -out: - return result; -} diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c deleted file mode 100644 index 75947edccb26..000000000000 --- a/arch/tile/lib/cpumask.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. 
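Returning to do_csum() above: it accumulates the IP checksum a word at a time with the chip's sum-of-absolute-differences instructions (v2sadau on TILE-Gx, sadah_u on TILEPro), then folds the carries with csum_long(). A plain reference version of the same accumulate-and-fold, eliding the alignment and byte-swap handling do_csum performs:

#include <stddef.h>

static unsigned int csum_ref(const unsigned short *buf, size_t nwords)
{
        unsigned long sum = 0;

        while (nwords--)
                sum += *buf++;                       /* 16-bit words */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);  /* end-around carry */
        return sum;
}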
- */ - -#include <linux/cpumask.h> -#include <linux/ctype.h> -#include <linux/errno.h> -#include <linux/smp.h> -#include <linux/export.h> - -/* - * Allow cropping out bits beyond the end of the array. - * Move to "lib" directory if more clients want to use this routine. - */ -int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits) -{ - unsigned a, b; - - bitmap_zero(maskp, nmaskbits); - do { - if (!isdigit(*bp)) - return -EINVAL; - a = simple_strtoul(bp, (char **)&bp, 10); - b = a; - if (*bp == '-') { - bp++; - if (!isdigit(*bp)) - return -EINVAL; - b = simple_strtoul(bp, (char **)&bp, 10); - } - if (!(a <= b)) - return -EINVAL; - if (b >= nmaskbits) - b = nmaskbits-1; - while (a <= b) { - set_bit(a, maskp); - a++; - } - if (*bp == ',') - bp++; - } while (*bp != '\0' && *bp != '\n'); - return 0; -} -EXPORT_SYMBOL(bitmap_parselist_crop); diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c deleted file mode 100644 index cdacdd11d360..000000000000 --- a/arch/tile/lib/delay.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/module.h> -#include <linux/delay.h> -#include <linux/thread_info.h> -#include <asm/timex.h> - -void __udelay(unsigned long usecs) -{ - if (usecs > ULONG_MAX / 1000) { - WARN_ON_ONCE(usecs > ULONG_MAX / 1000); - usecs = ULONG_MAX / 1000; - } - __ndelay(usecs * 1000); -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - cycles_t target = get_cycles(); - target += ns2cycles(nsecs); - while (get_cycles() < target) - cpu_relax(); -} -EXPORT_SYMBOL(__ndelay); - -void __delay(unsigned long cycles) -{ - cycles_t target = get_cycles() + cycles; - while (get_cycles() < target) - cpu_relax(); -} -EXPORT_SYMBOL(__delay); diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c deleted file mode 100644 index ecce8e177e3f..000000000000 --- a/arch/tile/lib/exports.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Exports from assembler code and from libtile-cc. 
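A note on bitmap_parselist_crop() above: it takes the usual "0-3,8,12-15" list syntax but, as the name says, crops ranges that run past nmaskbits instead of returning an error. A hypothetical call on a 64-bit map:

DECLARE_BITMAP(mask, 64);

/* "60-70" is cropped: bits 60-63 are set, 64-70 silently dropped. */
if (bitmap_parselist_crop("60-70", mask, 64))
        pr_err("bad list\n");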
- */ - -#include <linux/module.h> - -/* arch/tile/lib/usercopy.S */ -#include <linux/uaccess.h> -EXPORT_SYMBOL(clear_user_asm); -EXPORT_SYMBOL(flush_user_asm); -EXPORT_SYMBOL(finv_user_asm); - -/* arch/tile/kernel/entry.S */ -#include <linux/kernel.h> -#include <asm/processor.h> -EXPORT_SYMBOL(current_text_addr); - -/* arch/tile/kernel/head.S */ -EXPORT_SYMBOL(empty_zero_page); - -#ifdef CONFIG_FUNCTION_TRACER -/* arch/tile/kernel/mcount_64.S */ -#include <asm/ftrace.h> -EXPORT_SYMBOL(__mcount); -#endif /* CONFIG_FUNCTION_TRACER */ - -/* arch/tile/lib/, various memcpy files */ -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(raw_copy_to_user); -EXPORT_SYMBOL(raw_copy_from_user); -#ifdef __tilegx__ -EXPORT_SYMBOL(raw_copy_in_user); -#endif - -/* hypervisor glue */ -#include <hv/hypervisor.h> -EXPORT_SYMBOL(hv_dev_open); -EXPORT_SYMBOL(hv_dev_pread); -EXPORT_SYMBOL(hv_dev_pwrite); -EXPORT_SYMBOL(hv_dev_preada); -EXPORT_SYMBOL(hv_dev_pwritea); -EXPORT_SYMBOL(hv_dev_poll); -EXPORT_SYMBOL(hv_dev_poll_cancel); -EXPORT_SYMBOL(hv_dev_close); -EXPORT_SYMBOL(hv_sysconf); -EXPORT_SYMBOL(hv_confstr); -EXPORT_SYMBOL(hv_get_rtc); -EXPORT_SYMBOL(hv_set_rtc); - -/* libgcc.a */ -uint32_t __udivsi3(uint32_t dividend, uint32_t divisor); -EXPORT_SYMBOL(__udivsi3); -int32_t __divsi3(int32_t dividend, int32_t divisor); -EXPORT_SYMBOL(__divsi3); -uint64_t __udivdi3(uint64_t dividend, uint64_t divisor); -EXPORT_SYMBOL(__udivdi3); -int64_t __divdi3(int64_t dividend, int64_t divisor); -EXPORT_SYMBOL(__divdi3); -uint32_t __umodsi3(uint32_t dividend, uint32_t divisor); -EXPORT_SYMBOL(__umodsi3); -int32_t __modsi3(int32_t dividend, int32_t divisor); -EXPORT_SYMBOL(__modsi3); -uint64_t __umoddi3(uint64_t dividend, uint64_t divisor); -EXPORT_SYMBOL(__umoddi3); -int64_t __moddi3(int64_t dividend, int64_t divisor); -EXPORT_SYMBOL(__moddi3); -#ifdef __tilegx__ -typedef int TItype __attribute__((mode(TI))); -TItype __multi3(TItype a, TItype b); -EXPORT_SYMBOL(__multi3); /* required for gcc 7 and later */ -#else -int64_t __muldi3(int64_t, int64_t); -EXPORT_SYMBOL(__muldi3); -uint64_t __lshrdi3(uint64_t, unsigned int); -EXPORT_SYMBOL(__lshrdi3); -uint64_t __ashrdi3(uint64_t, unsigned int); -EXPORT_SYMBOL(__ashrdi3); -uint64_t __ashldi3(uint64_t, unsigned int); -EXPORT_SYMBOL(__ashldi3); -int __ffsdi2(uint64_t); -EXPORT_SYMBOL(__ffsdi2); -#endif diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c deleted file mode 100644 index cc3d9badf030..000000000000 --- a/arch/tile/lib/memchr_32.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> - -void *memchr(const void *s, int c, size_t n) -{ - const uint32_t *last_word_ptr; - const uint32_t *p; - const char *last_byte_ptr; - uintptr_t s_int; - uint32_t goal, before_mask, v, bits; - char *ret; - - if (__builtin_expect(n == 0, 0)) { - /* Don't dereference any memory if the array is empty. */ - return NULL; - } - - /* Get an aligned pointer. 
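The libgcc exports just above exist because the compiler emits out-of-line calls for operations the ISA lacks, so any module that does, say, 64-bit division on 32-bit TILEPro needs __udivdi3 resolved at load time. A hypothetical module snippet that links only because of those exports:

#include <linux/types.h>

/* 64-by-64 division on TILEPro compiles to a call to __udivdi3(). */
static u64 bytes_per_sec(u64 bytes, u64 secs)
{
        return bytes / secs;
}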
*/ - s_int = (uintptr_t) s; - p = (const uint32_t *)(s_int & -4); - - /* Create four copies of the byte for which we are looking. */ - goal = 0x01010101 * (uint8_t) c; - - /* Read the first word, but munge it so that bytes before the array - * will not match goal. - * - * Note that this shift count expression works because we know - * shift counts are taken mod 32. - */ - before_mask = (1 << (s_int << 3)) - 1; - v = (*p | before_mask) ^ (goal & before_mask); - - /* Compute the address of the last byte. */ - last_byte_ptr = (const char *)s + n - 1; - - /* Compute the address of the word containing the last byte. */ - last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4); - - while ((bits = __insn_seqb(v, goal)) == 0) { - if (__builtin_expect(p == last_word_ptr, 0)) { - /* We already read the last word in the array, - * so give up. - */ - return NULL; - } - v = *++p; - } - - /* We found a match, but it might be in a byte past the end - * of the array. - */ - ret = ((char *)p) + (__insn_ctz(bits) >> 3); - return (ret <= last_byte_ptr) ? ret : NULL; -} -EXPORT_SYMBOL(memchr); diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c deleted file mode 100644 index f8196b3a950e..000000000000 --- a/arch/tile/lib/memchr_64.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include "string-endian.h" - -void *memchr(const void *s, int c, size_t n) -{ - const uint64_t *last_word_ptr; - const uint64_t *p; - const char *last_byte_ptr; - uintptr_t s_int; - uint64_t goal, before_mask, v, bits; - char *ret; - - if (__builtin_expect(n == 0, 0)) { - /* Don't dereference any memory if the array is empty. */ - return NULL; - } - - /* Get an aligned pointer. */ - s_int = (uintptr_t) s; - p = (const uint64_t *)(s_int & -8); - - /* Create eight copies of the byte for which we are looking. */ - goal = copy_byte(c); - - /* Read the first word, but munge it so that bytes before the array - * will not match goal. - */ - before_mask = MASK(s_int); - v = (*p | before_mask) ^ (goal & before_mask); - - /* Compute the address of the last byte. */ - last_byte_ptr = (const char *)s + n - 1; - - /* Compute the address of the word containing the last byte. */ - last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8); - - while ((bits = __insn_v1cmpeq(v, goal)) == 0) { - if (__builtin_expect(p == last_word_ptr, 0)) { - /* We already read the last word in the array, - * so give up. - */ - return NULL; - } - v = *++p; - } - - /* We found a match, but it might be in a byte past the end - * of the array. - */ - ret = ((char *)p) + (CFZ(bits) >> 3); - return (ret <= last_byte_ptr) ? ret : NULL; -} -EXPORT_SYMBOL(memchr); diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S deleted file mode 100644 index 270f1267cd18..000000000000 --- a/arch/tile/lib/memcpy_32.S +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. 
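Both memchr variants above use the same word-at-a-time scheme: replicate the needle byte across a word (0x01010101 * c, or copy_byte() on 64-bit), mask off the bytes before the buffer start so they cannot match, then test a whole word per iteration (seqb / v1cmpeq) and turn the match mask into a byte offset with a count-zeroes step. A portable flavor of the per-byte match test, for reference:

#include <stdint.h>

/* Nonzero iff some byte of v equals the byte replicated in goal:
 * the classic SWAR zero-byte test applied to v ^ goal. */
static uint32_t any_byte_eq(uint32_t v, uint32_t goal)
{
        uint32_t x = v ^ goal;          /* matching bytes become 0x00 */
        return (x - 0x01010101u) & ~x & 0x80808080u;
}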
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <arch/chip.h> - - -/* - * This file shares the implementation of the userspace memcpy and - * the kernel's memcpy, copy_to_user and copy_from_user. - */ - -#include <linux/linkage.h> - -#define IS_MEMCPY 0 -#define IS_COPY_FROM_USER 1 -#define IS_COPY_TO_USER -1 - - .section .text.memcpy_common, "ax" - .align 64 - -/* Use this to preface each bundle that can cause an exception so - * the kernel can clean up properly. The special cleanup code should - * not use these, since it knows what it is doing. - */ -#define EX \ - .pushsection __ex_table, "a"; \ - .align 4; \ - .word 9f, memcpy_common_fixup; \ - .popsection; \ - 9 - - -/* raw_copy_from_user takes the kernel target address in r0, - * the user source in r1, and the bytes to copy in r2. - * It returns the number of uncopiable bytes (hopefully zero) in r0. - */ -ENTRY(raw_copy_from_user) -.type raw_copy_from_user, @function - FEEDBACK_ENTER_EXPLICIT(raw_copy_from_user, \ - .text.memcpy_common, \ - .Lend_memcpy_common - raw_copy_from_user) - { movei r29, IS_COPY_FROM_USER; j memcpy_common } - .size raw_copy_from_user, . - raw_copy_from_user - -/* raw_copy_to_user takes the user target address in r0, - * the kernel source in r1, and the bytes to copy in r2. - * It returns the number of uncopiable bytes (hopefully zero) in r0. - */ -ENTRY(raw_copy_to_user) -.type raw_copy_to_user, @function - FEEDBACK_REENTER(raw_copy_from_user) - { movei r29, IS_COPY_TO_USER; j memcpy_common } - .size raw_copy_to_user, . - raw_copy_to_user - -ENTRY(memcpy) -.type memcpy, @function - FEEDBACK_REENTER(raw_copy_from_user) - { movei r29, IS_MEMCPY } - .size memcpy, . - memcpy - /* Fall through */ - - .type memcpy_common, @function -memcpy_common: - /* On entry, r29 holds one of the IS_* macro values from above. */ - - - /* r0 is the dest, r1 is the source, r2 is the size. */ - - /* Save aside original dest so we can return it at the end. */ - { sw sp, lr; move r23, r0; or r4, r0, r1 } - - /* Check for an empty size. */ - { bz r2, .Ldone; andi r4, r4, 3 } - - /* Save aside original values in case of a fault. */ - { move r24, r1; move r25, r2 } - move r27, lr - - /* Check for an unaligned source or dest. */ - { bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 } - -.Lcheck_aligned_copy_size: - /* If we are copying < 256 bytes, branch to simple case. */ - { blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 } - - /* Copying >= 256 bytes, so jump to complex prefetching loop. */ - { andi r6, r1, 63; j .Lcopy_many } - -/* - * - * Aligned 4 byte at a time copy loop - * - */ - -.Lcopy_8_loop: - /* Copy two words at a time to hide load latency. */ -EX: { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 } -EX: { lw r4, r1; addi r1, r1, 4 } -EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } -EX: { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 } -.Lcopy_8_check: - { bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 } - - /* Copy odd leftover word, if any. 
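As the comments above note, all three entry points share one body, dispatched on r29, and the user-copy variants return the number of bytes left uncopied (zero on success). A typical, hypothetical caller therefore checks for a short copy:

/* raw_copy_from_user() returns bytes NOT copied; nonzero means fault. */
if (raw_copy_from_user(kbuf, ubuf, len))
        return -EFAULT;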
*/ - { bnzt r4, .Lcheck_odd_stragglers } -EX: { lw r3, r1; addi r1, r1, 4 } -EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } - -.Lcheck_odd_stragglers: - { bnz r2, .Lcopy_unaligned_few } - -.Ldone: - /* For memcpy return original dest address, else zero. */ - { mz r0, r29, r23; jrp lr } - - -/* - * - * Prefetching multiple cache line copy handler (for large transfers). - * - */ - - /* Copy words until r1 is cache-line-aligned. */ -.Lalign_loop: -EX: { lw r3, r1; addi r1, r1, 4 } - { andi r6, r1, 63 } -EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } -.Lcopy_many: - { bnzt r6, .Lalign_loop; addi r9, r0, 63 } - - { addi r3, r1, 60; andi r9, r9, -64 } - - /* No need to prefetch dst, we'll just do the wh64 - * right before we copy a line. - */ -EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, .; move r27, lr } -EX: { lw r6, r3; addi r3, r3, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } -EX: { lw r7, r3; addi r3, r3, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bz zero, .Lbig_loop2 } - - /* On entry to this loop: - * - r0 points to the start of dst line 0 - * - r1 points to start of src line 0 - * - r2 >= (256 - 60), only the first time the loop trips. - * - r3 contains r1 + 128 + 60 [pointer to end of source line 2] - * This is our prefetch address. When we get near the end - * rather than prefetching off the end this is changed to point - * to some "safe" recently loaded address. - * - r5 contains *(r1 + 60) [i.e. last word of source line 0] - * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] - * - r9 contains ((r0 + 63) & -64) - * [start of next dst cache line.] - */ - -.Lbig_loop: - { jal .Lcopy_line2; add r15, r1, r2 } - -.Lbig_loop2: - /* Copy line 0, first stalling until r5 is ready. */ -EX: { move r12, r5; lw r16, r1 } - { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } - /* Prefetch several lines ahead. */ -EX: { lw r5, r3; addi r3, r3, 64 } - { jal .Lcopy_line } - - /* Copy line 1, first stalling until r6 is ready. */ -EX: { move r12, r6; lw r16, r1 } - { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } - /* Prefetch several lines ahead. */ -EX: { lw r6, r3; addi r3, r3, 64 } - { jal .Lcopy_line } - - /* Copy line 2, first stalling until r7 is ready. */ -EX: { move r12, r7; lw r16, r1 } - { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } - /* Prefetch several lines ahead. */ -EX: { lw r7, r3; addi r3, r3, 64 } - /* Use up a caches-busy cycle by jumping back to the top of the - * loop. Might as well get it out of the way now. - */ - { j .Lbig_loop } - - - /* On entry: - * - r0 points to the destination line. - * - r1 points to the source line. - * - r3 is the next prefetch address. - * - r9 holds the last address used for wh64. - * - r12 = WORD_15 - * - r16 = WORD_0. - * - r17 == r1 + 16. - * - r27 holds saved lr to restore. - * - * On exit: - * - r0 is incremented by 64. - * - r1 is incremented by 64, unless that would point to a word - * beyond the end of the source array, in which case it is redirected - * to point to an arbitrary word already in the cache. - * - r2 is decremented by 64. - * - r3 is unchanged, unless it points to a word beyond the - * end of the source array, in which case it is redirected - * to point to an arbitrary word already in the cache. 
- * Redirecting is OK since if we are that close to the end - * of the array we will not come back to this subroutine - * and use the contents of the prefetched address. - * - r4 is nonzero iff r2 >= 64. - * - r9 is incremented by 64, unless it points beyond the - * end of the last full destination cache line, in which - * case it is redirected to a "safe address" that can be - * clobbered (sp - 64) - * - lr contains the value in r27. - */ - -/* r26 unused */ - -.Lcopy_line: - /* TODO: when r3 goes past the end, we would like to redirect it - * to prefetch the last partial cache line (if any) just once, for the - * benefit of the final cleanup loop. But we don't want to - * prefetch that line more than once, or subsequent prefetches - * will go into the RTF. But then .Lbig_loop should unconditionally - * branch to top of loop to execute final prefetch, and its - * nop should become a conditional branch. - */ - - /* We need two non-memory cycles here to cover the resources - * used by the loads initiated by the caller. - */ - { add r15, r1, r2 } -.Lcopy_line2: - { slt_u r13, r3, r15; addi r17, r1, 16 } - - /* NOTE: this will stall for one cycle as L1 is busy. */ - - /* Fill second L1D line. */ -EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ - - /* Prepare destination line for writing. */ -EX: { wh64 r9; addi r9, r9, 64 } - /* Load seven words that are L1D hits to cover wh64 L2 usage. */ - - /* Load the three remaining words from the last L1D line, which - * we know has already filled the L1D. - */ -EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ -EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ -EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ - - /* Load the three remaining words from the first L1D line, first - * stalling until it has filled by "looking at" r16. - */ -EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ -EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ -EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ - - /* Load second word from the second L1D line, first - * stalling until it has filled by "looking at" r17. - */ -EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ - - /* Store last word to the destination line, potentially dirtying it - * for the first time, which keeps the L2 busy for two cycles. - */ -EX: { sw r10, r12 } /* store(WORD_15) */ - - /* Use two L1D hits to cover the sw L2 access above. */ -EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ -EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ - - /* Fill third L1D line. */ -EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ - - /* Store first L1D line. */ -EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ -EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ -EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ -EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ - /* Store second L1D line. */ -EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ -EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ -EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ -EX: { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */ - -EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */ -EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ -EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ - - /* Store third L1D line. 
*/ -EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ -EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ -EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ -EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ - - /* Store rest of fourth L1D line. */ -EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ - { -EX: sw r0, r8 /* store(WORD_13) */ - addi r0, r0, 4 - /* Will r2 be > 64 after we subtract 64 below? */ - shri r4, r2, 7 - } - { -EX: sw r0, r11 /* store(WORD_14) */ - addi r0, r0, 8 - /* Record 64 bytes successfully copied. */ - addi r2, r2, -64 - } - - { jrp lr; move lr, r27 } - - /* Convey to the backtrace library that the stack frame is size - * zero, and the real return address is on the stack rather than - * in 'lr'. - */ - { info 8 } - - .align 64 -.Lcopy_unaligned_maybe_many: - /* Skip the setup overhead if we aren't copying many bytes. */ - { slti_u r8, r2, 20; sub r4, zero, r0 } - { bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 } - { bz r4, .Ldest_is_word_aligned; add r18, r1, r2 } - -/* - * - * unaligned 4 byte at a time copy handler. - * - */ - - /* Copy single bytes until r0 == 0 mod 4, so we can store words. */ -.Lalign_dest_loop: -EX: { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 } -EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } - { bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 } - - /* If source and dest are now *both* aligned, do an aligned copy. */ - { bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 } - -.Ldest_is_word_aligned: - -EX: { andi r8, r0, 63; lwadd_na r6, r1, 4} - { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } - - /* This copies unaligned words until either there are fewer - * than 4 bytes left to copy, or until the destination pointer - * is cache-aligned, whichever comes first. - * - * On entry: - * - r0 is the next store address. - * - r1 points 4 bytes past the load address corresponding to r0. - * - r2 >= 4 - * - r6 is the next aligned word loaded. - */ -.Lcopy_unaligned_src_words: -EX: { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 } - /* stall */ - { dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 } -EX: { swadd r0, r6, 4; addi r2, r2, -4 } - { bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 } - { bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 } - - /* On entry: - * - r0 is the next store address. - * - r1 points 4 bytes past the load address corresponding to r0. - * - r2 >= 4 (# of bytes left to store). - * - r6 is the next aligned src word value. - * - r9 = (r2 < 64U). - * - r18 points one byte past the end of source memory. - */ -.Ldest_is_L2_line_aligned: - - { - /* Not a full cache line remains. */ - bnz r9, .Lcleanup_unaligned_words - move r7, r6 - } - - /* r2 >= 64 */ - - /* Kick off two prefetches, but don't go past the end. */ - { addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 } - { prefetch r3; move r3, r8; slt_u r8, r8, r18 } - { mvz r3, r8, r1; addi r8, r3, 64 } - { prefetch r3; move r3, r8; slt_u r8, r8, r18 } - { mvz r3, r8, r1; movei r17, 0 } - -.Lcopy_unaligned_line: - /* Prefetch another line. */ - { prefetch r3; addi r15, r1, 60; addi r3, r3, 64 } - /* Fire off a load of the last word we are about to copy. */ -EX: { lw_na r15, r15; slt_u r8, r3, r18 } - -EX: { mvz r3, r8, r1; wh64 r0 } - - /* This loop runs twice. - * - * On entry: - * - r17 is even before the first iteration, and odd before - * the second. It is incremented inside the loop. Encountering - * an even value at the end of the loop makes it stop. - */ -.Lcopy_half_an_unaligned_line: -EX: { - /* Stall until the last byte is ready. 
In the steady state this - * guarantees all words to load below will be in the L2 cache, which - * avoids shunting the loads to the RTF. - */ - move zero, r15 - lwadd_na r7, r1, 16 - } -EX: { lwadd_na r11, r1, 12 } -EX: { lwadd_na r14, r1, -24 } -EX: { lwadd_na r8, r1, 4 } -EX: { lwadd_na r9, r1, 4 } -EX: { - lwadd_na r10, r1, 8 - /* r16 = (r2 < 64), after we subtract 32 from r2 below. */ - slti_u r16, r2, 64 + 32 - } -EX: { lwadd_na r12, r1, 4; addi r17, r17, 1 } -EX: { lwadd_na r13, r1, 8; dword_align r6, r7, r1 } -EX: { swadd r0, r6, 4; dword_align r7, r8, r1 } -EX: { swadd r0, r7, 4; dword_align r8, r9, r1 } -EX: { swadd r0, r8, 4; dword_align r9, r10, r1 } -EX: { swadd r0, r9, 4; dword_align r10, r11, r1 } -EX: { swadd r0, r10, 4; dword_align r11, r12, r1 } -EX: { swadd r0, r11, 4; dword_align r12, r13, r1 } -EX: { swadd r0, r12, 4; dword_align r13, r14, r1 } -EX: { swadd r0, r13, 4; addi r2, r2, -32 } - { move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line } - - { bzt r16, .Lcopy_unaligned_line; move r7, r6 } - - /* On entry: - * - r0 is the next store address. - * - r1 points 4 bytes past the load address corresponding to r0. - * - r2 >= 0 (# of bytes left to store). - * - r7 is the next aligned src word value. - */ -.Lcleanup_unaligned_words: - /* Handle any trailing bytes. */ - { bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 } - { bzt r8, .Lcopy_unaligned_src_words; move r6, r7 } - - /* Move r1 back to the point where it corresponds to r0. */ - { addi r1, r1, -4 } - - /* Fall through */ - -/* - * - * 1 byte at a time copy handler. - * - */ - -.Lcopy_unaligned_few: -EX: { lb_u r3, r1; addi r1, r1, 1 } -EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } - { bnzt r2, .Lcopy_unaligned_few } - -.Lcopy_unaligned_done: - - /* For memcpy return original dest address, else zero. */ - { mz r0, r29, r23; jrp lr } - -.Lend_memcpy_common: - .size memcpy_common, .Lend_memcpy_common - memcpy_common - - .section .fixup,"ax" -memcpy_common_fixup: - .type memcpy_common_fixup, @function - - /* Skip any bytes we already successfully copied. - * r2 (num remaining) is correct, but r0 (dst) and r1 (src) - * may not be quite right because of unrolling and prefetching. - * So we need to recompute their values as the address just - * after the last byte we are sure was successfully loaded and - * then stored. - */ - - /* Determine how many bytes we successfully copied. */ - { sub r3, r25, r2 } - - /* Add this to the original r0 and r1 to get their new values. */ - { add r0, r23, r3; add r1, r24, r3 } - - { bzt r29, memcpy_fixup_loop } - { blzt r29, copy_to_user_fixup_loop } - -copy_from_user_fixup_loop: - /* Try copying the rest one byte at a time, expecting a load fault. */ -.Lcfu: { lb_u r3, r1; addi r1, r1, 1 } - { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } - { bnzt r2, copy_from_user_fixup_loop } - -.Lcopy_from_user_fixup_zero_remainder: - move lr, r27 - { move r0, r2; jrp lr } - -copy_to_user_fixup_loop: - /* Try copying the rest one byte at a time, expecting a store fault. */ - { lb_u r3, r1; addi r1, r1, 1 } -.Lctu: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } - { bnzt r2, copy_to_user_fixup_loop } -.Lcopy_to_user_fixup_done: - move lr, r27 - { move r0, r2; jrp lr } - -memcpy_fixup_loop: - /* Try copying the rest one byte at a time. We expect a disastrous - * fault to happen since we are in fixup code, but let it happen. 
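The arithmetic at the top of the fixup above is worth spelling out: r25 saved the original length and r2 still counts bytes not yet copied, so the fixup derives how far the copy provably got and restarts both pointers there before retrying one byte at a time. In C terms:

size_t copied = saved_len - remaining;   /* sub r3, r25, r2 */
dst = saved_dst + copied;                /* add r0, r23, r3 */
src = saved_src + copied;                /* add r1, r24, r3 */
/* ...then copy byte-by-byte so a recurring fault lands exactly on
 * the first bad byte, leaving the precise remainder to return. */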
- */ - { lb_u r3, r1; addi r1, r1, 1 } - { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } - { bnzt r2, memcpy_fixup_loop } - /* This should be unreachable, we should have faulted again. - * But be paranoid and handle it in case some interrupt changed - * the TLB or something. - */ - move lr, r27 - { move r0, r23; jrp lr } - - .size memcpy_common_fixup, . - memcpy_common_fixup - - .section __ex_table,"a" - .align 4 - .word .Lcfu, .Lcopy_from_user_fixup_zero_remainder - .word .Lctu, .Lcopy_to_user_fixup_done diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c deleted file mode 100644 index 4815354b8cd2..000000000000 --- a/arch/tile/lib/memcpy_64.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ - -/* Must be 8 bytes in size. */ -#define op_t uint64_t - -/* Threshold value for when to enter the unrolled loops. */ -#define OP_T_THRES 16 - -#if CHIP_L2_LINE_SIZE() != 64 -#error "Assumes 64 byte line size" -#endif - -/* How many cache lines ahead should we prefetch? */ -#define PREFETCH_LINES_AHEAD 4 - -/* - * Provide "base versions" of load and store for the normal code path. - * The kernel provides other versions for userspace copies. - */ -#define ST(p, v) (*(p) = (v)) -#define LD(p) (*(p)) - -#ifndef USERCOPY_FUNC -#define ST1 ST -#define ST2 ST -#define ST4 ST -#define ST8 ST -#define LD1 LD -#define LD2 LD -#define LD4 LD -#define LD8 LD -#define RETVAL dstv -void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) -#else -/* - * Special kernel version will provide implementation of the LDn/STn - * macros to return a count of uncopied bytes due to mm fault. - */ -#define RETVAL 0 -int __attribute__((optimize("omit-frame-pointer"))) -USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) -#endif -{ - char *__restrict dst1 = (char *)dstv; - const char *__restrict src1 = (const char *)srcv; - const char *__restrict src1_end; - const char *__restrict prefetch; - op_t *__restrict dst8; /* 8-byte pointer to destination memory. */ - op_t final; /* Final bytes to write to trailing word, if any */ - long i; - - if (n < 16) { - for (; n; n--) - ST1(dst1++, LD1(src1++)); - return RETVAL; - } - - /* - * Locate the end of source memory we will copy. Don't - * prefetch past this. - */ - src1_end = src1 + n - 1; - - /* Prefetch ahead a few cache lines, but not past the end. */ - prefetch = src1; - for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { - __insn_prefetch(prefetch); - prefetch += CHIP_L2_LINE_SIZE(); - prefetch = (prefetch < src1_end) ? prefetch : src1; - } - - /* Copy bytes until dst is word-aligned. */ - for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--) - ST1(dst1++, LD1(src1++)); - - /* 8-byte pointer to destination memory. */ - dst8 = (op_t *)dst1; - - if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) { - /* Unaligned copy. 
*/ - - op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3; - const op_t *src8 = (const op_t *) ((uintptr_t)src1 & - -sizeof(op_t)); - const void *srci = (void *)src1; - int m; - - m = (CHIP_L2_LINE_SIZE() << 2) - - (((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1)); - m = (n < m) ? n : m; - m /= sizeof(op_t); - - /* Copy until 'dst' is cache-line-aligned. */ - n -= (sizeof(op_t) * m); - - switch (m % 4) { - case 0: - if (__builtin_expect(!m, 0)) - goto _M0; - tmp1 = LD8(src8++); - tmp2 = LD8(src8++); - goto _8B3; - case 2: - m += 2; - tmp3 = LD8(src8++); - tmp0 = LD8(src8++); - goto _8B1; - case 3: - m += 1; - tmp2 = LD8(src8++); - tmp3 = LD8(src8++); - goto _8B2; - case 1: - m--; - tmp0 = LD8(src8++); - tmp1 = LD8(src8++); - if (__builtin_expect(!m, 0)) - goto _8B0; - } - - do { - tmp2 = LD8(src8++); - tmp0 = __insn_dblalign(tmp0, tmp1, srci); - ST8(dst8++, tmp0); -_8B3: - tmp3 = LD8(src8++); - tmp1 = __insn_dblalign(tmp1, tmp2, srci); - ST8(dst8++, tmp1); -_8B2: - tmp0 = LD8(src8++); - tmp2 = __insn_dblalign(tmp2, tmp3, srci); - ST8(dst8++, tmp2); -_8B1: - tmp1 = LD8(src8++); - tmp3 = __insn_dblalign(tmp3, tmp0, srci); - ST8(dst8++, tmp3); - m -= 4; - } while (m); - -_8B0: - tmp0 = __insn_dblalign(tmp0, tmp1, srci); - ST8(dst8++, tmp0); - src8--; - -_M0: - if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) { - op_t tmp4, tmp5, tmp6, tmp7, tmp8; - - prefetch = ((const char *)src8) + - CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD; - - for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE(); - n -= CHIP_L2_LINE_SIZE()) { - /* Prefetch and advance to next line to - prefetch, but don't go past the end. */ - __insn_prefetch(prefetch); - - /* Make sure prefetch got scheduled - earlier. */ - __asm__ ("" : : : "memory"); - - prefetch += CHIP_L2_LINE_SIZE(); - prefetch = (prefetch < src1_end) ? prefetch : - (const char *) src8; - - tmp1 = LD8(src8++); - tmp2 = LD8(src8++); - tmp3 = LD8(src8++); - tmp4 = LD8(src8++); - tmp5 = LD8(src8++); - tmp6 = LD8(src8++); - tmp7 = LD8(src8++); - tmp8 = LD8(src8++); - - tmp0 = __insn_dblalign(tmp0, tmp1, srci); - tmp1 = __insn_dblalign(tmp1, tmp2, srci); - tmp2 = __insn_dblalign(tmp2, tmp3, srci); - tmp3 = __insn_dblalign(tmp3, tmp4, srci); - tmp4 = __insn_dblalign(tmp4, tmp5, srci); - tmp5 = __insn_dblalign(tmp5, tmp6, srci); - tmp6 = __insn_dblalign(tmp6, tmp7, srci); - tmp7 = __insn_dblalign(tmp7, tmp8, srci); - - __insn_wh64(dst8); - - ST8(dst8++, tmp0); - ST8(dst8++, tmp1); - ST8(dst8++, tmp2); - ST8(dst8++, tmp3); - ST8(dst8++, tmp4); - ST8(dst8++, tmp5); - ST8(dst8++, tmp6); - ST8(dst8++, tmp7); - - tmp0 = tmp8; - } - src8--; - } - - /* Copy the rest 8-byte chunks. */ - if (n >= sizeof(op_t)) { - tmp0 = LD8(src8++); - for (; n >= sizeof(op_t); n -= sizeof(op_t)) { - tmp1 = LD8(src8++); - tmp0 = __insn_dblalign(tmp0, tmp1, srci); - ST8(dst8++, tmp0); - tmp0 = tmp1; - } - src8--; - } - - if (n == 0) - return RETVAL; - - tmp0 = LD8(src8++); - tmp1 = ((const char *)src8 <= src1_end) - ? LD8((op_t *)src8) : 0; - final = __insn_dblalign(tmp0, tmp1, srci); - - } else { - /* Aligned copy. */ - - const op_t *__restrict src8 = (const op_t *)src1; - - /* src8 and dst8 are both word-aligned. */ - if (n >= CHIP_L2_LINE_SIZE()) { - /* Copy until 'dst' is cache-line-aligned. */ - for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); - n -= sizeof(op_t)) - ST8(dst8++, LD8(src8++)); - - for (; n >= CHIP_L2_LINE_SIZE(); ) { - op_t tmp0, tmp1, tmp2, tmp3; - op_t tmp4, tmp5, tmp6, tmp7; - - /* - * Prefetch and advance to next line - * to prefetch, but don't go past the - * end. 
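The unaligned path above leans entirely on __insn_dblalign, which funnels two adjacent aligned loads into the unaligned 8-byte window selected by the low bits of the original source pointer. A portable equivalent using shifts (little-endian shown; the __BIG_ENDIAN__ case mirrors it):

#include <stdint.h>

/* Combine aligned words w0,w1 into the unaligned word that starts
 * 'off' bytes into w0 (off = src & 7); the off == 0 test avoids an
 * undefined shift by 64. */
static uint64_t dblalign(uint64_t w0, uint64_t w1, unsigned off)
{
        if (off == 0)
                return w0;
        return (w0 >> (8 * off)) | (w1 << (8 * (8 - off)));
}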
- */ - __insn_prefetch(prefetch); - - /* Make sure prefetch got scheduled - earlier. */ - __asm__ ("" : : : "memory"); - - prefetch += CHIP_L2_LINE_SIZE(); - prefetch = (prefetch < src1_end) ? prefetch : - (const char *)src8; - - /* - * Do all the loads before wh64. This - * is necessary if [src8, src8+7] and - * [dst8, dst8+7] share the same cache - * line and dst8 <= src8, as can be - * the case when called from memmove, - * or with code tested on x86, whose - * memcpy always works with forward - * copies. - */ - tmp0 = LD8(src8++); - tmp1 = LD8(src8++); - tmp2 = LD8(src8++); - tmp3 = LD8(src8++); - tmp4 = LD8(src8++); - tmp5 = LD8(src8++); - tmp6 = LD8(src8++); - tmp7 = LD8(src8++); - - /* wh64 and wait for tmp7 load completion. */ - __asm__ ("move %0, %0; wh64 %1\n" - : : "r"(tmp7), "r"(dst8)); - - ST8(dst8++, tmp0); - ST8(dst8++, tmp1); - ST8(dst8++, tmp2); - ST8(dst8++, tmp3); - ST8(dst8++, tmp4); - ST8(dst8++, tmp5); - ST8(dst8++, tmp6); - ST8(dst8++, tmp7); - - n -= CHIP_L2_LINE_SIZE(); - } -#if CHIP_L2_LINE_SIZE() != 64 -# error "Fix code that assumes a particular L2 cache line size." -#endif - } - - for (; n >= sizeof(op_t); n -= sizeof(op_t)) - ST8(dst8++, LD8(src8++)); - - if (__builtin_expect(n == 0, 1)) - return RETVAL; - - final = LD8(src8); - } - - /* n != 0 if we get here. Write out any trailing bytes. */ - dst1 = (char *)dst8; -#ifndef __BIG_ENDIAN__ - if (n & 4) { - ST4((uint32_t *)dst1, final); - dst1 += 4; - final >>= 32; - n &= 3; - } - if (n & 2) { - ST2((uint16_t *)dst1, final); - dst1 += 2; - final >>= 16; - n &= 1; - } - if (n) - ST1((uint8_t *)dst1, final); -#else - if (n & 4) { - ST4((uint32_t *)dst1, final >> 32); - dst1 += 4; - } else { - final >>= 32; - } - if (n & 2) { - ST2((uint16_t *)dst1, final >> 16); - dst1 += 2; - } else { - final >>= 16; - } - if (n & 1) - ST1((uint8_t *)dst1, final >> 8); -#endif - - return RETVAL; -} - -#ifdef USERCOPY_FUNC -#undef ST1 -#undef ST2 -#undef ST4 -#undef ST8 -#undef LD1 -#undef LD2 -#undef LD4 -#undef LD8 -#undef USERCOPY_FUNC -#endif diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c deleted file mode 100644 index a3fea9fd973e..000000000000 --- a/arch/tile/lib/memcpy_user_64.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Do memcpy(), but trap and return "n" when a load or store faults. - * - * Note: this idiom only works when memcpy() compiles to a leaf function. - * Here "leaf function" not only means it makes no calls, but also that it - * performs no stack operations (sp, stack frame pointer) and uses no - * callee-saved registers; otherwise "jrp lr" would be incorrect, since - * stack-frame unwinding is bypassed. Since memcpy() is not complex, - * these conditions are satisfied here, but we need to be careful when - * modifying this file. This is not a clean solution, but it is the best - * one so far. - * - * Also note that we are capturing "n" from the containing scope here. 
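 * Each _LD/_ST expansion below pairs its (possibly faulting) memory
 * instruction at local label 1 with a fixup stub at label 2 via an
 * __ex_table entry; on a bad user address the trap handler looks up
 * the faulting PC and resumes at the stub, which returns the still
 * live "n", i.e. the count of bytes not yet copied. Roughly, in
 * plain C (a conceptual rendering, not a real kernel API;
 * access_faults() is hypothetical):
 *
 *	size_t copy(char *dst, const char *src, size_t n)
 *	{
 *		while (n) {
 *			if (access_faults(dst, src))
 *				return n;	// bytes NOT copied
 *			*dst++ = *src++;
 *			n--;
 *		}
 *		return 0;	// everything copied
 *	}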
- */ - -#define _ST(p, inst, v) \ - ({ \ - asm("1: " #inst " %0, %1;" \ - ".pushsection .coldtext,\"ax\";" \ - "2: { move r0, %2; jrp lr };" \ - ".section __ex_table,\"a\";" \ - ".align 8;" \ - ".quad 1b, 2b;" \ - ".popsection" \ - : "=m" (*(p)) : "r" (v), "r" (n)); \ - }) - -#define _LD(p, inst) \ - ({ \ - unsigned long __v; \ - asm("1: " #inst " %0, %1;" \ - ".pushsection .coldtext,\"ax\";" \ - "2: { move r0, %2; jrp lr };" \ - ".section __ex_table,\"a\";" \ - ".align 8;" \ - ".quad 1b, 2b;" \ - ".popsection" \ - : "=r" (__v) : "m" (*(p)), "r" (n)); \ - __v; \ - }) - -#define USERCOPY_FUNC raw_copy_to_user -#define ST1(p, v) _ST((p), st1, (v)) -#define ST2(p, v) _ST((p), st2, (v)) -#define ST4(p, v) _ST((p), st4, (v)) -#define ST8(p, v) _ST((p), st, (v)) -#define LD1 LD -#define LD2 LD -#define LD4 LD -#define LD8 LD -#include "memcpy_64.c" - -#define USERCOPY_FUNC raw_copy_from_user -#define ST1 ST -#define ST2 ST -#define ST4 ST -#define ST8 ST -#define LD1(p) _LD((p), ld1u) -#define LD2(p) _LD((p), ld2u) -#define LD4(p) _LD((p), ld4u) -#define LD8(p) _LD((p), ld) -#include "memcpy_64.c" - -#define USERCOPY_FUNC raw_copy_in_user -#define ST1(p, v) _ST((p), st1, (v)) -#define ST2(p, v) _ST((p), st2, (v)) -#define ST4(p, v) _ST((p), st4, (v)) -#define ST8(p, v) _ST((p), st, (v)) -#define LD1(p) _LD((p), ld1u) -#define LD2(p) _LD((p), ld2u) -#define LD4(p) _LD((p), ld4u) -#define LD8(p) _LD((p), ld) -#include "memcpy_64.c" diff --git a/arch/tile/lib/memmove.c b/arch/tile/lib/memmove.c deleted file mode 100644 index fd615ae6ade7..000000000000 --- a/arch/tile/lib/memmove.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> - -void *memmove(void *dest, const void *src, size_t n) -{ - if ((const char *)src >= (char *)dest + n - || (char *)dest >= (const char *)src + n) { - /* We found no overlap, so let memcpy do all the heavy - * lifting (prefetching, etc.) - */ - return memcpy(dest, src, n); - } - - if (n != 0) { - const uint8_t *in; - uint8_t x; - uint8_t *out; - int stride; - - if (src < dest) { - /* copy backwards */ - in = (const uint8_t *)src + n - 1; - out = (uint8_t *)dest + n - 1; - stride = -1; - } else { - /* copy forwards */ - in = (const uint8_t *)src; - out = (uint8_t *)dest; - stride = 1; - } - - /* Manually software-pipeline this loop. */ - x = *in; - in += stride; - - while (--n != 0) { - *out = x; - out += stride; - x = *in; - in += stride; - } - - *out = x; - } - - return dest; -} -EXPORT_SYMBOL(memmove); diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c deleted file mode 100644 index 2042bfe6595f..000000000000 --- a/arch/tile/lib/memset_32.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include <arch/chip.h> - -void *memset(void *s, int c, size_t n) -{ - uint32_t *out32; - int n32; - uint32_t v16, v32; - uint8_t *out8 = s; - int to_align32; - - /* Experimentation shows that a trivial tight loop is a win up until - * around a size of 20, where writing a word at a time starts to win. - */ -#define BYTE_CUTOFF 20 - -#if BYTE_CUTOFF < 3 - /* This must be at least this big, or some code later - * on doesn't work. - */ -#error "BYTE_CUTOFF is too small" -#endif - - if (n < BYTE_CUTOFF) { - /* Strangely, this turns out to be the tightest way to - * write this loop. - */ - if (n != 0) { - do { - /* Strangely, combining these into one line - * performs worse. - */ - *out8 = c; - out8++; - } while (--n != 0); - } - - return s; - } - - /* Align 'out8'. We know n >= 3 so this won't write past the end. */ - while (((uintptr_t) out8 & 3) != 0) { - *out8++ = c; - --n; - } - - /* Align 'n'. */ - while (n & 3) - out8[--n] = c; - - out32 = (uint32_t *) out8; - n32 = n >> 2; - - /* Tile input byte out to 32 bits. */ - v16 = __insn_intlb(c, c); - v32 = __insn_intlh(v16, v16); - - /* This must be at least 8 or the following loop doesn't work. */ -#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) - - /* Determine how many words we need to emit before the 'out32' - * pointer becomes aligned modulo the cache line size. - */ - to_align32 = - (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1); - - /* Only bother aligning and using wh64 if there is at least - * one full cache line to process. This check also prevents - * overrunning the end of the buffer with alignment words. - */ - if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) { - int lines_left; - - /* Align out32 mod the cache line size so we can use wh64. */ - n32 -= to_align32; - for (; to_align32 != 0; to_align32--) { - *out32 = v32; - out32++; - } - - /* Use unsigned divide to turn this into a right shift. */ - lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS; - - do { - /* Only wh64 a few lines at a time, so we don't - * exceed the maximum number of victim lines. - */ - int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) - ? lines_left - : CHIP_MAX_OUTSTANDING_VICTIMS()); - uint32_t *wh = out32; - int i = x; - int j; - - lines_left -= x; - - do { - __insn_wh64(wh); - wh += CACHE_LINE_SIZE_IN_WORDS; - } while (--i); - - for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); - j != 0; j--) { - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - } - } while (lines_left != 0); - - /* We processed all full lines above, so only this many - * words remain to be processed. - */ - n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; - } - - /* Now handle any leftover values. */ - if (n32 != 0) { - do { - *out32 = v32; - out32++; - } while (--n32 != 0); - } - - return s; -} -EXPORT_SYMBOL(memset); diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c deleted file mode 100644 index 03ef69cd73de..000000000000 --- a/arch/tile/lib/memset_64.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include <arch/chip.h> -#include "string-endian.h" - -void *memset(void *s, int c, size_t n) -{ - uint64_t *out64; - int n64, to_align64; - uint64_t v64; - uint8_t *out8 = s; - - /* Experimentation shows that a trivial tight loop is a win up until - * around a size of 20, where writing a word at a time starts to win. - */ -#define BYTE_CUTOFF 20 - -#if BYTE_CUTOFF < 7 - /* This must be at least this big, or some code later - * on doesn't work. - */ -#error "BYTE_CUTOFF is too small" -#endif - - if (n < BYTE_CUTOFF) { - /* Strangely, this turns out to be the tightest way to - * write this loop. - */ - if (n != 0) { - do { - /* Strangely, combining these into one line - * performs worse. - */ - *out8 = c; - out8++; - } while (--n != 0); - } - - return s; - } - - /* Align 'out8'. We know n >= 7 so this won't write past the end. */ - while (((uintptr_t) out8 & 7) != 0) { - *out8++ = c; - --n; - } - - /* Align 'n'. */ - while (n & 7) - out8[--n] = c; - - out64 = (uint64_t *) out8; - n64 = n >> 3; - - /* Tile input byte out to 64 bits. */ - v64 = copy_byte(c); - - /* This must be at least 8 or the following loop doesn't work. */ -#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) - - /* Determine how many words we need to emit before the 'out64' - * pointer becomes aligned modulo the cache line size. - */ - to_align64 = (-((uintptr_t)out64 >> 3)) & - (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1); - - /* Only bother aligning and using wh64 if there is at least - * one full cache line to process. This check also prevents - * overrunning the end of the buffer with alignment words. - */ - if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) { - int lines_left; - - /* Align out64 mod the cache line size so we can use wh64. */ - n64 -= to_align64; - for (; to_align64 != 0; to_align64--) { - *out64 = v64; - out64++; - } - - /* Use unsigned divide to turn this into a right shift. */ - lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS; - - do { - /* Only wh64 a few lines at a time, so we don't - * exceed the maximum number of victim lines. - */ - int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) - ? lines_left - : CHIP_MAX_OUTSTANDING_VICTIMS()); - uint64_t *wh = out64; - int i = x; - int j; - - lines_left -= x; - - do { - __insn_wh64(wh); - wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS; - } while (--i); - - for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4); - j != 0; j--) { - *out64++ = v64; - *out64++ = v64; - *out64++ = v64; - *out64++ = v64; - } - } while (lines_left != 0); - - /* We processed all full lines above, so only this many - * words remain to be processed. - */ - n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1; - } - - /* Now handle any leftover values. 
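 * The fill value v64 is the byte c tiled across all eight byte
 * lanes. On targets without a byte-shuffle instruction the same
 * tiling is commonly written as a multiply, e.g. (an equivalent
 * sketch, not the code used here):
 *
 *	static inline uint64_t tile_byte(uint8_t c)
 *	{
 *		// 0x01 in every byte replicates c into every byte
 *		return (uint64_t)c * 0x0101010101010101ULL;
 *	}
 *
 * which produces the same value copy_byte() builds with
 * __insn_shufflebytes.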
*/ - if (n64 != 0) { - do { - *out64 = v64; - out64++; - } while (--n64 != 0); - } - - return s; -} -EXPORT_SYMBOL(memset); diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c deleted file mode 100644 index db9333f2447c..000000000000 --- a/arch/tile/lib/spinlock_32.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/spinlock.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <arch/spr_def.h> - -#include "spinlock_common.h" - -void arch_spin_lock(arch_spinlock_t *lock) -{ - int my_ticket; - int iterations = 0; - int delta; - - while ((my_ticket = __insn_tns((void *)&lock->next_ticket)) & 1) - delay_backoff(iterations++); - - /* Increment the next ticket number, implicitly releasing tns lock. */ - lock->next_ticket = my_ticket + TICKET_QUANTUM; - - /* Wait until it's our turn. */ - while ((delta = my_ticket - lock->current_ticket) != 0) - relax((128 / CYCLES_PER_RELAX_LOOP) * delta); -} -EXPORT_SYMBOL(arch_spin_lock); - -int arch_spin_trylock(arch_spinlock_t *lock) -{ - /* - * Grab a ticket; no need to retry if it's busy, we'll just - * treat that the same as "locked", since someone else - * will lock it momentarily anyway. - */ - int my_ticket = __insn_tns((void *)&lock->next_ticket); - - if (my_ticket == lock->current_ticket) { - /* Not currently locked, so lock it by keeping this ticket. */ - lock->next_ticket = my_ticket + TICKET_QUANTUM; - /* Success! */ - return 1; - } - - if (!(my_ticket & 1)) { - /* Release next_ticket. */ - lock->next_ticket = my_ticket; - } - - return 0; -} -EXPORT_SYMBOL(arch_spin_trylock); - -/* - * The low byte is always reserved to be the marker for a "tns" operation - * since the low bit is set to "1" by a tns. The next seven bits are - * zeroes. The next byte holds the "next" writer value, i.e. the ticket - * available for the next task that wants to write. The third byte holds - * the current writer value, i.e. the writer who holds the current ticket. - * If current == next == 0, there are no interested writers. - */ -#define WR_NEXT_SHIFT _WR_NEXT_SHIFT -#define WR_CURR_SHIFT _WR_CURR_SHIFT -#define WR_WIDTH _WR_WIDTH -#define WR_MASK ((1 << WR_WIDTH) - 1) - -/* - * The last eight bits hold the active reader count. This has to be - * zero before a writer can start to write. - */ -#define RD_COUNT_SHIFT _RD_COUNT_SHIFT -#define RD_COUNT_WIDTH _RD_COUNT_WIDTH -#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) - - -/* - * We can get the read lock if everything but the reader bits (which - * are in the high part of the word) is zero, i.e. no active or - * waiting writers, no tns. - * - * We guard the tns/store-back with an interrupt critical section to - * preserve the semantic that the same read lock can be acquired in an - * interrupt context. 
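 * The fast path relies on the lock-word layout described above:
 * the reader count lives in the top RD_COUNT_WIDTH bits, so
 * shifting it out leaves zero exactly when there is no tns in
 * progress and no waiting or active writer. A sketch of the test
 * used below, with the 8-bit field widths written out:
 *
 *	// bit 0: tns marker; bits 8..15: next writer ticket;
 *	// bits 16..23: current writer; bits 24..31: reader count
 *	static inline int can_take_read_lock(u32 val)
 *	{
 *		return (val << _RD_COUNT_WIDTH) == 0;
 *	}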
- */ -int arch_read_trylock(arch_rwlock_t *rwlock) -{ - u32 val; - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); - val = __insn_tns((int *)&rwlock->lock); - if (likely((val << _RD_COUNT_WIDTH) == 0)) { - val += 1 << RD_COUNT_SHIFT; - rwlock->lock = val; - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); - BUG_ON(val == 0); /* we don't expect wraparound */ - return 1; - } - if ((val & 1) == 0) - rwlock->lock = val; - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); - return 0; -} -EXPORT_SYMBOL(arch_read_trylock); - -/* - * Spin doing arch_read_trylock() until we acquire the lock. - * ISSUE: This approach can permanently starve readers. A reader who sees - * a writer could instead take a ticket lock (just like a writer would), - * and atomically enter read mode (with 1 reader) when it gets the ticket. - * This way both readers and writers would always make forward progress - * in a finite time. - */ -void arch_read_lock(arch_rwlock_t *rwlock) -{ - u32 iterations = 0; - while (unlikely(!arch_read_trylock(rwlock))) - delay_backoff(iterations++); -} -EXPORT_SYMBOL(arch_read_lock); - -void arch_read_unlock(arch_rwlock_t *rwlock) -{ - u32 val, iterations = 0; - - mb(); /* guarantee anything modified under the lock is visible */ - for (;;) { - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); - val = __insn_tns((int *)&rwlock->lock); - if (likely((val & 1) == 0)) { - rwlock->lock = val - (1 << _RD_COUNT_SHIFT); - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); - break; - } - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); - delay_backoff(iterations++); - } -} -EXPORT_SYMBOL(arch_read_unlock); - -/* - * We don't need an interrupt critical section here (unlike for - * arch_read_lock) since we should never use a bare write lock where - * it could be interrupted by code that could try to re-acquire it. - */ -void arch_write_lock(arch_rwlock_t *rwlock) -{ - /* - * The trailing underscore on this variable (and curr_ below) - * reminds us that the high bits are garbage; we mask them out - * when we compare them. - */ - u32 my_ticket_; - u32 iterations = 0; - u32 val = __insn_tns((int *)&rwlock->lock); - - if (likely(val == 0)) { - rwlock->lock = 1 << _WR_NEXT_SHIFT; - return; - } - - /* - * Wait until there are no readers, then bump up the next - * field and capture the ticket value. - */ - for (;;) { - if (!(val & 1)) { - if ((val >> RD_COUNT_SHIFT) == 0) - break; - rwlock->lock = val; - } - delay_backoff(iterations++); - val = __insn_tns((int *)&rwlock->lock); - } - - /* Take out the next ticket and extract my ticket value. */ - rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); - my_ticket_ = val >> WR_NEXT_SHIFT; - - /* Wait until the "current" field matches our ticket. */ - for (;;) { - u32 curr_ = val >> WR_CURR_SHIFT; - u32 delta = ((my_ticket_ - curr_) & WR_MASK); - if (likely(delta == 0)) - break; - - /* Delay based on how many lock-holders are still out there. */ - relax((256 / CYCLES_PER_RELAX_LOOP) * delta); - - /* - * Get a non-tns value to check; we don't need to tns - * it ourselves. Since we're not tns'ing, we retry - * more rapidly to get a valid value. - */ - while ((val = rwlock->lock) & 1) - relax(4); - } -} -EXPORT_SYMBOL(arch_write_lock); - -int arch_write_trylock(arch_rwlock_t *rwlock) -{ - u32 val = __insn_tns((int *)&rwlock->lock); - - /* - * If a tns is in progress, or there's a waiting or active locker, - * or active readers, we can't take the lock, so give up. 
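 * ("tns" is test-and-set: it atomically stores 1 to the word and
 * returns the previous value. What __insn_tns does, rendered as C,
 * with the load and store happening as one atomic instruction:
 *
 *	int tns(int *p)
 *	{
 *		int old = *p;	// atomic with the store below
 *		*p = 1;
 *		return old;
 *	}
 *
 * A nonzero result therefore means either that another cpu's tns
 * is in flight, low bit set, in which case we must not store, or
 * that the word held real lock state, which we must write back.)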
- */ - if (unlikely(val != 0)) { - if (!(val & 1)) - rwlock->lock = val; - return 0; - } - - /* Set the "next" field to mark it locked. */ - rwlock->lock = 1 << _WR_NEXT_SHIFT; - return 1; -} -EXPORT_SYMBOL(arch_write_trylock); - -void arch_write_unlock(arch_rwlock_t *rwlock) -{ - u32 val, eq, mask; - - mb(); /* guarantee anything modified under the lock is visible */ - val = __insn_tns((int *)&rwlock->lock); - if (likely(val == (1 << _WR_NEXT_SHIFT))) { - rwlock->lock = 0; - return; - } - while (unlikely(val & 1)) { - /* Limited backoff since we are the highest-priority task. */ - relax(4); - val = __insn_tns((int *)&rwlock->lock); - } - mask = 1 << WR_CURR_SHIFT; - val = __insn_addb(val, mask); - eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); - val = __insn_mz(eq & mask, val); - rwlock->lock = val; -} -EXPORT_SYMBOL(arch_write_unlock); diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c deleted file mode 100644 index de414c22892f..000000000000 --- a/arch/tile/lib/spinlock_64.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/spinlock.h> -#include <linux/module.h> -#include <asm/processor.h> - -#include "spinlock_common.h" - -/* - * Read the spinlock value without allocating in our cache and without - * causing an invalidation to another cpu with a copy of the cacheline. - * This is important when we are spinning waiting for the lock. - */ -static inline u32 arch_spin_read_noalloc(void *lock) -{ - return atomic_cmpxchg((atomic_t *)lock, -1, -1); -} - -/* - * Wait until the high bits (current) match my ticket. - * If we notice the overflow bit set on entry, we clear it. - */ -void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) -{ - if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { - __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); - my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; - } - - for (;;) { - u32 val = arch_spin_read_noalloc(lock); - u32 delta = my_ticket - arch_spin_current(val); - if (delta == 0) - return; - relax((128 / CYCLES_PER_RELAX_LOOP) * delta); - } -} -EXPORT_SYMBOL(arch_spin_lock_slow); - -/* - * Check the lock to see if it is plausible, and try to get it with cmpxchg(). - */ -int arch_spin_trylock(arch_spinlock_t *lock) -{ - u32 val = arch_spin_read_noalloc(lock); - if (unlikely(arch_spin_current(val) != arch_spin_next(val))) - return 0; - return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) - == val; -} -EXPORT_SYMBOL(arch_spin_trylock); - - -/* - * If the read lock fails due to a writer, we retry periodically - * until the value is positive and we write our incremented reader count. - */ -void __read_lock_failed(arch_rwlock_t *rw) -{ - u32 val; - int iterations = 0; - do { - delay_backoff(iterations++); - val = __insn_fetchaddgez4(&rw->lock, 1); - } while (unlikely(arch_write_val_locked(val))); -} -EXPORT_SYMBOL(__read_lock_failed); - -/* - * If we failed because there were readers, clear the "writer" bit - * so we don't block additional readers. 
Otherwise, there was another - * writer anyway, so our "fetchor" made no difference. Then wait, - * issuing periodic fetchor instructions, until we get the lock. - */ -void __write_lock_failed(arch_rwlock_t *rw, u32 val) -{ - int iterations = 0; - do { - if (!arch_write_val_locked(val)) - val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); - delay_backoff(iterations++); - val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); - } while (val != 0); -} -EXPORT_SYMBOL(__write_lock_failed); diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h deleted file mode 100644 index 6ac37509faca..000000000000 --- a/arch/tile/lib/spinlock_common.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * This file is included into spinlock_32.c or _64.c. - */ - -/* - * The mfspr in relax() is 5 or 6 cycles, plus 2 for the loop - * overhead. - */ -#ifdef __tilegx__ -#define CYCLES_PER_RELAX_LOOP 7 -#else -#define CYCLES_PER_RELAX_LOOP 8 -#endif - -/* - * Idle the core for CYCLES_PER_RELAX_LOOP * iterations cycles. - */ -static inline void -relax(int iterations) -{ - for (/*above*/; iterations > 0; iterations--) - __insn_mfspr(SPR_PASS); - barrier(); -} - -/* Perform bounded exponential backoff. */ -static void delay_backoff(int iterations) -{ - u32 exponent, loops; - - /* - * 2^exponent is how many times we go around the loop, - * each pass of which takes 8 cycles. We want to start with - * a 16- to 31-cycle loop, so we need to go around a minimum - * of 2 = 2^1 times; hence we bias the original value up by 1. - */ - exponent = iterations + 1; - - /* - * Don't allow the exponent to exceed 8, so we have at most - * 256 loops, or 2,048 (to 4,095) cycles, as our maximum. - */ - if (exponent > 8) - exponent = 8; - - loops = 1 << exponent; - - /* Add a randomness factor so two cpus never get in lock step. */ - loops += __insn_crc32_32(stack_pointer, get_cycles_low()) & - (loops - 1); - - relax(loops); -} diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c deleted file mode 100644 index 841fe6963019..000000000000 --- a/arch/tile/lib/strchr_32.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> - -char *strchr(const char *s, int c) -{ - int z, g; - - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint32_t *p = (const uint32_t *)(s_int & -4); - - /* Create four copies of the byte for which we are looking. 
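 * Multiplying by 0x01010101 broadcasts the byte into all four byte
 * lanes, e.g. 0x5a * 0x01010101 == 0x5a5a5a5a, and __insn_seqb then
 * compares all four lanes at once. A generic-C sketch of such a
 * per-byte compare (the exact nonzero pattern produced for each
 * matching byte does not matter here, since only the lowest set bit
 * is consumed, via ctz):
 *
 *	static inline uint32_t byte_eq(uint32_t a, uint32_t b)
 *	{
 *		uint32_t r = 0;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			if (((a >> (8 * i)) & 0xff) ==
 *			    ((b >> (8 * i)) & 0xff))
 *				r |= 1U << (8 * i);
 *		return r;
 *	}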
*/ - const uint32_t goal = 0x01010101 * (uint8_t) c; - - /* Read the first aligned word, but force bytes before the string to - * match neither zero nor goal (we make sure the high bit of each - * byte is 1, and the low 7 bits are all the opposite of the goal - * byte). - * - * Note that this shift count expression works because we know shift - * counts are taken mod 32. - */ - const uint32_t before_mask = (1 << (s_int << 3)) - 1; - uint32_t v = (*p | before_mask) ^ (goal & __insn_shrib(before_mask, 1)); - - uint32_t zero_matches, goal_matches; - while (1) { - /* Look for a terminating '\0'. */ - zero_matches = __insn_seqb(v, 0); - - /* Look for the goal byte. */ - goal_matches = __insn_seqb(v, goal); - - if (__builtin_expect(zero_matches | goal_matches, 0)) - break; - - v = *++p; - } - - z = __insn_ctz(zero_matches); - g = __insn_ctz(goal_matches); - - /* If we found c before '\0' we got a match. Note that if c == '\0' - * then g == z, and we correctly return the address of the '\0' - * rather than NULL. - */ - return (g <= z) ? ((char *)p) + (g >> 3) : NULL; -} -EXPORT_SYMBOL(strchr); diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c deleted file mode 100644 index fe6e31c06f8d..000000000000 --- a/arch/tile/lib/strchr_64.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include "string-endian.h" - -char *strchr(const char *s, int c) -{ - int z, g; - - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint64_t *p = (const uint64_t *)(s_int & -8); - - /* Create eight copies of the byte for which we are looking. */ - const uint64_t goal = copy_byte(c); - - /* Read the first aligned word, but force bytes before the string to - * match neither zero nor goal (we make sure the high bit of each - * byte is 1, and the low 7 bits are all the opposite of the goal - * byte). - */ - const uint64_t before_mask = MASK(s_int); - uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1)); - - uint64_t zero_matches, goal_matches; - while (1) { - /* Look for a terminating '\0'. */ - zero_matches = __insn_v1cmpeqi(v, 0); - - /* Look for the goal byte. */ - goal_matches = __insn_v1cmpeq(v, goal); - - if (__builtin_expect((zero_matches | goal_matches) != 0, 0)) - break; - - v = *++p; - } - - z = CFZ(zero_matches); - g = CFZ(goal_matches); - - /* If we found c before '\0' we got a match. Note that if c == '\0' - * then g == z, and we correctly return the address of the '\0' - * rather than NULL. - */ - return (g <= z) ? ((char *)p) + (g >> 3) : NULL; -} -EXPORT_SYMBOL(strchr); diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h deleted file mode 100644 index 2e49cbfe9371..000000000000 --- a/arch/tile/lib/string-endian.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2013 Tilera Corporation. All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Provide a mask based on the pointer alignment that - * sets up non-zero bytes before the beginning of the string. - * The MASK expression works because shift counts are taken mod 64. - * Also, specify how to count "first" and "last" bits - * when the bits have been read as a word. - */ - -#include <asm/byteorder.h> - -#ifdef __LITTLE_ENDIAN -#define MASK(x) (__insn_shl(1ULL, ((x) << 3)) - 1) -#define NULMASK(x) ((2ULL << (x)) - 1) -#define CFZ(x) __insn_ctz(x) -#define REVCZ(x) __insn_clz(x) -#else -#define MASK(x) (__insn_shl(-2LL, ((-(x) << 3) - 1))) -#define NULMASK(x) (-2LL << (63 - (x))) -#define CFZ(x) __insn_clz(x) -#define REVCZ(x) __insn_ctz(x) -#endif - -/* - * Create eight copies of the byte in a uint64_t. Byte Shuffle uses - * the bytes of srcB as the index into the dest vector to select a - * byte. With all indices of zero, the first byte is copied into all - * the other bytes. - */ -static inline uint64_t copy_byte(uint8_t byte) -{ - return __insn_shufflebytes(byte, 0, 0); -} diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c deleted file mode 100644 index f26f88e11e4a..000000000000 --- a/arch/tile/lib/strlen_32.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> - -size_t strlen(const char *s) -{ - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint32_t *p = (const uint32_t *)(s_int & -4); - - /* Read the first word, but force bytes before the string to be nonzero. - * This expression works because we know shift counts are taken mod 32. - */ - uint32_t v = *p | ((1 << (s_int << 3)) - 1); - - uint32_t bits; - while ((bits = __insn_seqb(v, 0)) == 0) - v = *++p; - - return ((const char *)p) + (__insn_ctz(bits) >> 3) - s; -} -EXPORT_SYMBOL(strlen); diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c deleted file mode 100644 index 9583fc3361fa..000000000000 --- a/arch/tile/lib/strlen_64.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. 
See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include "string-endian.h" - -size_t strlen(const char *s) -{ - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint64_t *p = (const uint64_t *)(s_int & -8); - - /* Read and MASK the first word. */ - uint64_t v = *p | MASK(s_int); - - uint64_t bits; - while ((bits = __insn_v1cmpeqi(v, 0)) == 0) - v = *++p; - - return ((const char *)p) + (CFZ(bits) >> 3) - s; -} -EXPORT_SYMBOL(strlen); diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c deleted file mode 100644 index 1434141d9e01..000000000000 --- a/arch/tile/lib/strnlen_32.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2013 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> - -size_t strnlen(const char *s, size_t count) -{ - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint32_t *p = (const uint32_t *)(s_int & -4); - size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); - size_t len; - uint32_t v, bits; - - /* Avoid page fault risk by not reading any bytes when count is 0. */ - if (count == 0) - return 0; - - /* Read first word, but force bytes before the string to be nonzero. */ - v = *p | ((1 << ((s_int << 3) & 31)) - 1); - - while ((bits = __insn_seqb(v, 0)) == 0) { - if (bytes_read >= count) { - /* Read COUNT bytes and didn't find the terminator. */ - return count; - } - v = *++p; - bytes_read += sizeof(v); - } - - len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s; - return (len < count ? len : count); -} -EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c deleted file mode 100644 index 2e8de6a5136f..000000000000 --- a/arch/tile/lib/strnlen_64.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2013 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/module.h> -#include "string-endian.h" - -size_t strnlen(const char *s, size_t count) -{ - /* Get an aligned pointer. */ - const uintptr_t s_int = (uintptr_t) s; - const uint64_t *p = (const uint64_t *)(s_int & -8); - size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); - size_t len; - uint64_t v, bits; - - /* Avoid page fault risk by not reading any bytes when count is 0. */ - if (count == 0) - return 0; - - /* Read and MASK the first word. 
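 * MASK(s_int) sets every byte lane that lies before the start of
 * the string, so those bytes can never look like the terminating
 * NUL. On a little-endian word it is equivalent to this sketch
 * (shift counts on this ISA are taken mod 64, which is why the
 * header's version need not reduce s_int first):
 *
 *	static inline uint64_t before_mask(uintptr_t s)
 *	{
 *		// 0xff in each byte lane below the string's start
 *		return (1ULL << ((s & 7) * 8)) - 1;
 *	}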
*/ - v = *p | MASK(s_int); - - while ((bits = __insn_v1cmpeqi(v, 0)) == 0) { - if (bytes_read >= count) { - /* Read COUNT bytes and didn't find the terminator. */ - return count; - } - v = *++p; - bytes_read += sizeof(v); - } - - len = ((const char *) p) + (CFZ(bits) >> 3) - s; - return (len < count ? len : count); -} -EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c deleted file mode 100644 index 030abe3ee4f1..000000000000 --- a/arch/tile/lib/uaccess.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/uaccess.h> -#include <linux/module.h> - -int __range_ok(unsigned long addr, unsigned long size) -{ - unsigned long limit = current_thread_info()->addr_limit.seg; - return !((addr < limit && size <= limit - addr) || - is_arch_mappable_range(addr, size)); -} -EXPORT_SYMBOL(__range_ok); diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S deleted file mode 100644 index db93ad5fae25..000000000000 --- a/arch/tile/lib/usercopy_32.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/linkage.h> -#include <asm/errno.h> -#include <asm/cache.h> -#include <arch/chip.h> - -/* Access user memory, but use MMU to avoid propagating kernel exceptions. */ - -/* - * clear_user_asm takes the user target address in r0 and the - * number of bytes to zero in r1. - * It returns the number of uncopiable bytes (hopefully zero) in r0. - * Note that we don't use a separate .fixup section here since we fall - * through into the "fixup" code as the last straight-line bundle anyway. - */ -STD_ENTRY(clear_user_asm) - { bz r1, 2f; or r2, r0, r1 } - andi r2, r2, 3 - bzt r2, .Lclear_aligned_user_asm -1: { sb r0, zero; addi r0, r0, 1; addi r1, r1, -1 } - bnzt r1, 1b -2: { move r0, r1; jrp lr } - .pushsection __ex_table,"a" - .align 4 - .word 1b, 2b - .popsection - -.Lclear_aligned_user_asm: -1: { sw r0, zero; addi r0, r0, 4; addi r1, r1, -4 } - bnzt r1, 1b -2: { move r0, r1; jrp lr } - STD_ENDPROC(clear_user_asm) - .pushsection __ex_table,"a" - .align 4 - .word 1b, 2b - .popsection - -/* - * flush_user_asm takes the user target address in r0 and the - * number of bytes to flush in r1. - * It returns the number of unflushable bytes (hopefully zero) in r0. 
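 * The entry sequence rounds the byte range outward to whole L2
 * cache lines and then walks it at the flush stride; the address
 * arithmetic, rendered as C (a sketch of what the bundles below
 * compute, not actual kernel code):
 *
 *	start = addr & -L2_CACHE_BYTES;
 *	end = (addr + len + L2_CACHE_BYTES - 1) & -L2_CACHE_BYTES;
 *	for (len = end - start; len != 0; len -= CHIP_FLUSH_STRIDE()) {
 *		flush(start);			// one cache-line flush
 *		start += CHIP_FLUSH_STRIDE();
 *	}
 *
 * The fixup at label 2 then returns whatever remains in r1, which
 * is zero when the whole walk completed.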
- */ -STD_ENTRY(flush_user_asm) - bz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } - { addi r0, r0, CHIP_FLUSH_STRIDE(); bnzt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(flush_user_asm) - .pushsection __ex_table,"a" - .align 4 - .word 1b, 2b - .popsection - -/* - * finv_user_asm takes the user target address in r0 and the - * number of bytes to flush-invalidate in r1. - * It returns the number of not finv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(finv_user_asm) - bz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } - { addi r0, r0, CHIP_FINV_STRIDE(); bnzt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(finv_user_asm) - .pushsection __ex_table,"a" - .align 4 - .word 1b, 2b - .popsection diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S deleted file mode 100644 index 9322dc551e91..000000000000 --- a/arch/tile/lib/usercopy_64.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/linkage.h> -#include <asm/errno.h> -#include <asm/cache.h> -#include <arch/chip.h> - -/* Access user memory, but use MMU to avoid propagating kernel exceptions. */ - -/* - * clear_user_asm takes the user target address in r0 and the - * number of bytes to zero in r1. - * It returns the number of uncopiable bytes (hopefully zero) in r0. - * Note that we don't use a separate .fixup section here since we fall - * through into the "fixup" code as the last straight-line bundle anyway. - */ -STD_ENTRY(clear_user_asm) - { beqz r1, 2f; or r2, r0, r1 } - andi r2, r2, 7 - beqzt r2, .Lclear_aligned_user_asm -1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } - bnezt r1, 1b -2: { move r0, r1; jrp lr } - .pushsection __ex_table,"a" - .align 8 - .quad 1b, 2b - .popsection - -.Lclear_aligned_user_asm: -1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } - bnezt r1, 1b -2: { move r0, r1; jrp lr } - STD_ENDPROC(clear_user_asm) - .pushsection __ex_table,"a" - .align 8 - .quad 1b, 2b - .popsection - -/* - * flush_user_asm takes the user target address in r0 and the - * number of bytes to flush in r1. - * It returns the number of unflushable bytes (hopefully zero) in r0. - */ -STD_ENTRY(flush_user_asm) - beqz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } - { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(flush_user_asm) - .pushsection __ex_table,"a" - .align 8 - .quad 1b, 2b - .popsection - -/* - * finv_user_asm takes the user target address in r0 and the - * number of bytes to flush-invalidate in r1. 
- * It returns the number of not finv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(finv_user_asm) - beqz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } - { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(finv_user_asm) - .pushsection __ex_table,"a" - .align 8 - .quad 1b, 2b - .popsection