summaryrefslogtreecommitdiffstats
path: root/arch (follow)
Commit message (Collapse)AuthorAgeFilesLines
* lib/GCD.c: use binary GCD algorithm instead of EuclideanZhaoxiu Zeng2016-05-2117-0/+40
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The binary GCD algorithm is based on the following facts: 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) Even on x86 machines with reasonable division hardware, the binary algorithm runs about 25% faster (80% the execution time) than the division-based Euclidian algorithm. On platforms like Alpha and ARMv6 where division is a function call to emulation code, it's even more significant. There are two variants of the code here, depending on whether a fast __ffs (find least significant set bit) instruction is available. This allows the unpredictable branches in the bit-at-a-time shifting loop to be eliminated. If fast __ffs is not available, the "even/odd" GCD variant is used. 
I use the following code to benchmark: #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <time.h> #include <unistd.h> #define swap(a, b) \ do { \ a ^= b; \ b ^= a; \ a ^= b; \ } while (0) unsigned long gcd0(unsigned long a, unsigned long b) { unsigned long r; if (a < b) { swap(a, b); } if (b == 0) return a; while ((r = a % b) != 0) { a = b; b = r; } return b; } unsigned long gcd1(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); for (;;) { a >>= __builtin_ctzl(a); if (a == b) return a << __builtin_ctzl(r); if (a < b) swap(a, b); a -= b; } } unsigned long gcd2(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; for (;;) { while (!(a & r)) a >>= 1; if (a == b) return a; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } unsigned long gcd3(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); if (b == 1) return r & -r; for (;;) { a >>= __builtin_ctzl(a); if (a == 1) return r & -r; if (a == b) return a << __builtin_ctzl(r); if (a < b) swap(a, b); a -= b; } } unsigned long gcd4(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; if (b == r) return r; for (;;) { while (!(a & r)) a >>= 1; if (a == r) return r; if (a == b) return a; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = { gcd0, gcd1, gcd2, gcd3, gcd4, }; #define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0])) #if defined(__x86_64__) #define rdtscll(val) do { \ unsigned long __a,__d; \ __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \ } while(0) static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, 
unsigned long), unsigned long a, unsigned long b, unsigned long *res) { unsigned long long start, end; unsigned long long ret; unsigned long gcd_res; rdtscll(start); gcd_res = gcd(a, b); rdtscll(end); if (end >= start) ret = end - start; else ret = ~0ULL - start + 1 + end; *res = gcd_res; return ret; } #else static inline struct timespec read_time(void) { struct timespec time; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time); return time; } static inline unsigned long long diff_time(struct timespec start, struct timespec end) { struct timespec temp; if ((end.tv_nsec - start.tv_nsec) < 0) { temp.tv_sec = end.tv_sec - start.tv_sec - 1; temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec; } else { temp.tv_sec = end.tv_sec - start.tv_sec; temp.tv_nsec = end.tv_nsec - start.tv_nsec; } return temp.tv_sec * 1000000000ULL + temp.tv_nsec; } static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long), unsigned long a, unsigned long b, unsigned long *res) { struct timespec start, end; unsigned long gcd_res; start = read_time(); gcd_res = gcd(a, b); end = read_time(); *res = gcd_res; return diff_time(start, end); } #endif static inline unsigned long get_rand() { if (sizeof(long) == 8) return (unsigned long)rand() << 32 | rand(); else return rand(); } int main(int argc, char **argv) { unsigned int seed = time(0); int loops = 100; int repeats = 1000; unsigned long (*res)[TEST_ENTRIES]; unsigned long long elapsed[TEST_ENTRIES]; int i, j, k; for (;;) { int opt = getopt(argc, argv, "n:r:s:"); /* End condition always first */ if (opt == -1) break; switch (opt) { case 'n': loops = atoi(optarg); break; case 'r': repeats = atoi(optarg); break; case 's': seed = strtoul(optarg, NULL, 10); break; default: /* You won't actually get here. */ break; } } res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops); memset(elapsed, 0, sizeof(elapsed)); srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); /* Do we have args? 
*/ unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); unsigned long long min_elapsed[TEST_ENTRIES]; for (k = 0; k < repeats; k++) { for (i = 0; i < TEST_ENTRIES; i++) { unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]); if (k == 0 || min_elapsed[i] > tmp) min_elapsed[i] = tmp; } } for (i = 0; i < TEST_ENTRIES; i++) elapsed[i] += min_elapsed[i]; } for (i = 0; i < TEST_ENTRIES; i++) printf("gcd%d: elapsed %llu\n", i, elapsed[i]); k = 0; srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); for (i = 1; i < TEST_ENTRIES; i++) { if (res[j][i] != res[j][0]) break; } if (i < TEST_ENTRIES) { if (k == 0) { k = 1; fprintf(stderr, "Error:\n"); } fprintf(stderr, "gcd(%lu, %lu): ", a, b); for (i = 0; i < TEST_ENTRIES; i++) fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n"); } } if (k == 0) fprintf(stderr, "PASS\n"); free(res); return 0; } Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got: zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 10174 gcd1: elapsed 2120 gcd2: elapsed 2902 gcd3: elapsed 2039 gcd4: elapsed 2812 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9309 gcd1: elapsed 2280 gcd2: elapsed 2822 gcd3: elapsed 2217 gcd4: elapsed 2710 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9589 gcd1: elapsed 2098 gcd2: elapsed 2815 gcd3: elapsed 2030 gcd4: elapsed 2718 PASS zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10 gcd0: elapsed 9914 gcd1: elapsed 2309 gcd2: elapsed 2779 gcd3: elapsed 2228 gcd4: elapsed 2709 PASS [akpm@linux-foundation.org: avoid #defining a CONFIG_ variable] Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com> Signed-off-by: George Spelvin <linux@horizon.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* printk/nmi: generic solution for safe printk in NMIPetr Mladek2016-05-2114-1/+17
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. 
The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek <pmladek@suse.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> Suggested-by: Steven Rostedt <rostedt@goodmis.org> Cc: Jan Kara <jack@suse.cz> Acked-by: Russell King <rmk+kernel@arm.linux.org.uk> [arm part] Cc: Daniel Thompson <daniel.thompson@linaro.org> Cc: Jiri Kosina <jkosina@suse.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: David Miller <davem@davemloft.net> Cc: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* exit_thread: accept a task parameter to be exitedJiri Slaby2016-05-2115-44/+39
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby <jslaby@suse.cz> Cc: "David S. Miller" <davem@davemloft.net> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chen Liqin <liqin.linux@gmail.com> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Chris Zankel <chris@zankel.net> Cc: David Howells <dhowells@redhat.com> Cc: Fenghua Yu <fenghua.yu@intel.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: James Hogan <james.hogan@imgtec.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Jesper Nilsson <jesper.nilsson@axis.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Jonas Bonn <jonas@southpole.se> Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: Ley Foon Tan <lftan@altera.com> Cc: Mark Salter <msalter@redhat.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Matt Turner <mattst88@gmail.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Mikael Starvik <starvik@axis.com> Cc: Paul Mackerras 
<paulus@samba.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Rich Felker <dalias@libc.org> Cc: Richard Henderson <rth@twiddle.net> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Richard Weinberger <richard@nod.at> Cc: Russell King <linux@arm.linux.org.uk> Cc: Steven Miao <realmz6@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* exit_thread: remove empty bodiesJiri Slaby2016-05-2135-140/+17
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Define HAVE_EXIT_THREAD for archs which want to do something in exit_thread. For others, let's define exit_thread as an empty inline. This is a cleanup before we change the prototype of exit_thread to accept a task parameter. [akpm@linux-foundation.org: fix mips] Signed-off-by: Jiri Slaby <jslaby@suse.cz> Cc: "David S. Miller" <davem@davemloft.net> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chen Liqin <liqin.linux@gmail.com> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Chris Zankel <chris@zankel.net> Cc: David Howells <dhowells@redhat.com> Cc: Fenghua Yu <fenghua.yu@intel.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: James Hogan <james.hogan@imgtec.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Jesper Nilsson <jesper.nilsson@axis.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Jonas Bonn <jonas@southpole.se> Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: Ley Foon Tan <lftan@altera.com> Cc: Mark Salter <msalter@redhat.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Matt Turner <mattst88@gmail.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Mikael Starvik <starvik@axis.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Rich Felker <dalias@libc.org> Cc: Richard 
Henderson <rth@twiddle.net> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Richard Weinberger <richard@nod.at> Cc: Russell King <linux@arm.linux.org.uk> Cc: Steven Miao <realmz6@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* mn10300: let exit_fpu accept a taskJiri Slaby2016-05-212-5/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We need to call exit_thread from copy_process in a fail path. Since exit_thread on mn10300 calls exit_fpu, make it accept task_struct as a parameter now. Signed-off-by: Jiri Slaby <jslaby@suse.cz> Cc: "David S. Miller" <davem@davemloft.net> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chen Liqin <liqin.linux@gmail.com> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Chris Zankel <chris@zankel.net> Cc: David Howells <dhowells@redhat.com> Cc: Fenghua Yu <fenghua.yu@intel.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: James Hogan <james.hogan@imgtec.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Jesper Nilsson <jesper.nilsson@axis.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Jonas Bonn <jonas@southpole.se> Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: Ley Foon Tan <lftan@altera.com> Cc: Mark Salter <msalter@redhat.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Matt Turner <mattst88@gmail.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Mikael Starvik <starvik@axis.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Rich Felker <dalias@libc.org> Cc: Richard Henderson <rth@twiddle.net> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Richard Weinberger 
<richard@nod.at> Cc: Russell King <linux@arm.linux.org.uk> Cc: Steven Miao <realmz6@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* x86/kasan: instrument user memory access APIAndrey Ryabinin2016-05-212-0/+12
| | | | | | | | | | | | | | | | | | | | | | | | | | Exchange between user and kernel memory is coded in assembly language, which means that such accesses won't be spotted by KASAN, as the compiler instruments only C code. Add explicit KASAN checks to the user memory access API to ensure that userspace writes to (or reads from) valid kernel memory. Note: Unlike the others, strncpy_from_user() is written mostly in C and KASAN sees memory accesses in it. However, it makes sense to add an explicit check for all @count bytes that *potentially* could be written to the kernel. [aryabinin@virtuozzo.com: move kasan check under the condition] Link: http://lkml.kernel.org/r/1462869209-21096-1-git-send-email-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/1462538722-1574-4-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: Alexander Potapenko <glider@google.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* Merge branch 'linus' of ↵Linus Torvalds2016-05-201-2/+11
|\ | | | | | | | | | | | | | | | | | | git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6 Pull crypto fix from Herbert Xu: "Fix a regression that causes sha-mb to crash" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: crypto: sha1-mb - make sha1_x8_avx2() conform to C function ABI
| * crypto: sha1-mb - make sha1_x8_avx2() conform to C function ABIJosh Poimboeuf2016-05-171-2/+11
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Megha Dey reported a kernel panic in crypto code. The problem is that sha1_x8_avx2() clobbers registers r12-r15 without saving and restoring them. Before commit aec4d0e301f1 ("x86/asm/crypto: Simplify stack usage in sha-mb functions"), those registers were saved and restored by the callers of the function. I removed them with that commit because I didn't realize sha1_x8_avx2() clobbered them. Fix the potential undefined behavior associated with clobbering the registers and make the behavior less surprising by changing the registers to be callee saved/restored to conform with the C function call ABI. Also, rdx (aka RSP_SAVE) doesn't need to be saved: I verified that none of the callers rely on it being saved, and it's not a callee-saved register in the C ABI. Fixes: aec4d0e301f1 ("x86/asm/crypto: Simplify stack usage in sha-mb functions") Cc: stable@vger.kernel.org # 4.6 Reported-by: Megha Dey <megha.dey@linux.intel.com> Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
* | Merge tag 'powerpc-4.7-1' of ↵Linus Torvalds2016-05-20152-2999/+6008
|\ \ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux Pull powerpc updates from Michael Ellerman: "Highlights: - Support for Power ISA 3.0 (Power9) Radix Tree MMU from Aneesh Kumar K.V - Live patching support for ppc64le (also merged via livepatching.git) Various cleanups & minor fixes from: - Aaro Koskinen, Alexey Kardashevskiy, Andrew Donnellan, Aneesh Kumar K.V, Chris Smart, Daniel Axtens, Frederic Barrat, Gavin Shan, Ian Munsie, Lennart Sorensen, Madhavan Srinivasan, Mahesh Salgaonkar, Markus Elfring, Michael Ellerman, Oliver O'Halloran, Paul Gortmaker, Paul Mackerras, Rashmica Gupta, Russell Currey, Suraj Jitindar Singh, Thiago Jung Bauermann, Valentin Rothberg, Vipin K Parashar. 
General: - Update LMB associativity index during DLPAR add/remove from Nathan Fontenot - Fix branching to OOL handlers in relocatable kernel from Hari Bathini - Add support for userspace Power9 copy/paste from Chris Smart - Always use STRICT_MM_TYPECHECKS from Michael Ellerman - Add mask of possible MMU features from Michael Ellerman PCI: - Enable pass through of NVLink to guests from Alexey Kardashevskiy - Cleanups in preparation for powernv PCI hotplug from Gavin Shan - Don't report error in eeh_pe_reset_and_recover() from Gavin Shan - Restore initial state in eeh_pe_reset_and_recover() from Gavin Shan - Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell" from Guilherme G Piccoli - Remove the dependency on EEH struct in DDW mechanism from Guilherme G Piccoli selftests: - Test cp_abort during context switch from Chris Smart - Add several tests for transactional memory support from Rashmica Gupta perf: - Add support for sampling interrupt register state from Anju T - Add support for unwinding perf-stackdump from Chandan Kumar cxl: - Configure the PSL for two CAPI ports on POWER8NVL from Philippe Bergheaud - Allow initialization on timebase sync failures from Frederic Barrat - Increase timeout for detection of AFU mmio hang from Frederic Barrat - Handle num_of_processes larger than can fit in the SPA from Ian Munsie - Ensure PSL interrupt is configured for contexts with no AFU IRQs from Ian Munsie - Add kernel API to allow a context to operate with relocate disabled from Ian Munsie - Check periodically the coherent platform function's state from Christophe Lombard Freescale: - Updates from Scott: "Contains 86xx fixes, minor device tree fixes, an erratum workaround, and a kconfig dependency fix." 
* tag 'powerpc-4.7-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (192 commits) powerpc/86xx: Fix PCI interrupt map definition powerpc/86xx: Move pci1 definition to the include file powerpc/fsl: Fix build of the dtb embedded kernel images powerpc/fsl: Fix rcpm compatible string powerpc/fsl: Remove FSL_SOC dependency from FSL_LBC powerpc/fsl-pci: Add a workaround for PCI 5 errata powerpc/fsl: Fix SPI compatible on t208xrdb and t1040rdb powerpc/powernv/npu: Add PE to PHB's list powerpc/powernv: Fix insufficient memory allocation powerpc/iommu: Remove the dependency on EEH struct in DDW mechanism Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell" powerpc/eeh: Drop unnecessary label in eeh_pe_change_owner() powerpc/eeh: Ignore handlers in eeh_pe_reset_and_recover() powerpc/eeh: Restore initial state in eeh_pe_reset_and_recover() powerpc/eeh: Don't report error in eeh_pe_reset_and_recover() Revert "powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus()" powerpc/powernv/npu: Enable NVLink pass through powerpc/powernv/npu: Rework TCE Kill handling powerpc/powernv/npu: Add set/unset window helpers powerpc/powernv/ioda2: Export debug helper pe_level_printk() ...
| * | powerpc/86xx: Fix PCI interrupt map definitionAlessio Igor Bogani2016-05-171-16/+16
| | | | | | | | | | | | | | | | | | | | | | | | | | | Fix PCI interrupt map definition from 2 to 4 cells. Move interrupt-map and interrupt-map-mask and clone interrupts into the pcie child nodes. Signed-off-by: Alessio Igor Bogani <alessio.bogani@elettra.eu> Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/86xx: Move pci1 definition to the include fileAlessio Igor Bogani2016-05-178-91/+38
| | | | | | | | | | | | | | | Signed-off-by: Alessio Igor Bogani <alessio.bogani@elettra.eu> Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/fsl: Fix build of the dtb embedded kernel imagesAlessio Igor Bogani2016-05-171-3/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit dc37374b9c833 ("powerpc/fsl: Move Freescale device tree files into fsl folder") moved a lot of device tree files into fsl directory, fixing Makefile for cuImage target only. Unfortunately there are other targets which require embedding a device tree into the kernel image (e.g. dtbImage.%). So use a more generic approach. Signed-off-by: Alessio Igor Bogani <alessio.bogani@elettra.eu> [scottwood: cleaned up commit message] Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/fsl: Fix rcpm compatible stringChenhui Zhao2016-05-172-2/+2
| | | | | | | | | | | | | | | | | | | | | | | | T1040, T1042, T1023, and T1024 should use the compatible string "fsl,qoriq-rcpm-2.1". Signed-off-by: Chenhui Zhao <chenhui.zhao@nxp.com> Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/fsl: Remove FSL_SOC dependency from FSL_LBCScott Wood2016-05-171-1/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This dependency led to kconfig errors when MTD_NAND_FSL_ELBC was enabled, which selects FSL_LBC, in the absence of FSL_SOC, as reported in http://patchwork.ozlabs.org/patch/564405/ It was originally suggested to add an FSL_SOC dependency to MTD_NAND_FSL_ELBC, but the FSL_SOC symbol has been a growing problem due to hardware being shared between PPC and ARM SoCs. Even though eLBC isn't found on ARM SoCs (the newer IFC is used instead), I don't want to expand the use of FSL_SOC for things other than functions exported by fsl_soc.c. In particular, it would be odd to add it to MTD_NAND_FSL_ELBC and then remove it from MTD_NAND_FSL_IFC. Removing artificial dependencies also helps get compile-test exposure via randconfig, allyesconfig, etc. Reported-by: Brian Norris <computersforpeace@gmail.com> Cc: Brian Norris <computersforpeace@gmail.com> Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/fsl-pci: Add a workaround for PCI 5 erratachenhui zhao2016-05-161-0/+24
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Issue: As a master, the PCI IP block can combine a memory write to the last PCI double word (4 bytes) of a cacheline with a 4 byte memory write to the first PCI double word of the subsequent cacheline. This affects 32-bit PCI target devices that blindly assert STOP on memory-write transactions, without detecting that the data beat being transferred is the last data beat of the transaction. It can cause a hang. PCI-X operation is not affected by this erratum. Workaround: Setting the bit MDS in the PCI Bus Function Register will disable the combining of crossing cacheline boundary requests into one burst transaction. Therefore, it can prevent the errata scenario from occurring. This errata exists in MPC8543, MPC8543E, MPC8545, MPC8545E, MPC8547, MPC8547E, MPC8548 and MPC8548E. Refer to PCI 5 in MPC8548 errata document. Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> Signed-off-by: Zhiqiang Hou <Zhiqiang.Hou@freescale.com> [scottwood: whitespace fix] Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/fsl: Fix SPI compatible on t208xrdb and t1040rdbHou Zhiqiang2016-05-162-2/+2
| | | | | | | | | | | | | | | | | | | | | | | | On the t208xrdb and t1040rdb, the SPI device is n25q512ax3 instead of n25q512a. Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@freescale.com> Signed-off-by: Scott Wood <oss@buserror.net>
| * | powerpc/powernv/npu: Add PE to PHB's listAlexey Kardashevskiy2016-05-121-0/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Before commit 3e68dc57 "powerpc/powernv: Remove DMA32 PE list", NPU PEs were linked to the NPU PHB via phb->ioda.pe_dma_list; after that fix, the phb->ioda.pe_list is used. During the pe_dma_list removal, list_add_tail(&phb->ioda.pe_dma_list) was removed; however, no replacement list_add() was added, so this patch adds it. Fixes: 3e68dc57219a ("powerpc/powernv: Remove DMA32 PE list") Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Fix insufficient memory allocationAlexey Kardashevskiy2016-05-121-1/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The pnv_pci_init_ioda_phb() helper allocates a blob to store auxilary data such PE and M32/M64 segment allocation maps; this single blob has few partitions, size of each is derived from the PE number - phb->ioda.total_pe_num. It was assumed that the minimum PE number is 8, however it is 4 for NPU so the pe_alloc part was missing in the allocated blob. It was invisible till recently as we were not tracking used M64 segments and NPUs do not use M32 segments so the phb->ioda.m32_segmap (which was pointing to the same address as phb->ioda.pe_alloc) has never been written to leaving the pe_alloc memory intact. After commit 401203ac2d "powerpc/powernv: Track M64 segment consumption" the pe_alloc gets corrupted and PE allocation cannot work. This fixes the issue by enforcing the minimum PE number to 8. Fixes: 401203ac2d15 ("powerpc/powernv: Track M64 segment consumption") Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/iommu: Remove the dependency on EEH struct in DDW mechanismGuilherme G. Piccoli2016-05-121-12/+12
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit 39baadbf36ce ("powerpc/eeh: Remove eeh information from pci_dn") changed the pci_dn struct by removing its EEH-related members. As part of this clean-up, DDW mechanism was modified to read the device configuration address from eeh_dev struct. As a consequence, now if we disable EEH mechanism on kernel command-line for example, the DDW mechanism will fail, generating a kernel oops by dereferencing a NULL pointer (which turns to be the eeh_dev pointer). This patch just changes the configuration address calculation on DDW functions to a manual calculation based on pci_dn members instead of using eeh_dev-based address. No functional changes were made. This was tested on pSeries, both in PHyp and qemu guest. Fixes: 39baadbf36ce ("powerpc/eeh: Remove eeh information from pci_dn") Cc: stable@vger.kernel.org # v3.4+ Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell"Guilherme G. Piccoli2016-05-121-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This reverts commit 89a51df5ab1d38b257300b8ac940bbac3bb0eb9b. The function eeh_add_device_early() is used to perform EEH initialization in devices added later on the system, like in hotplug/DLPAR scenarios. Since the commit 89a51df5ab1d ("powerpc/eeh: Fix crash in eeh_add_device_early() on Cell") a new check was introduced in this function - Cell has no EEH capabilities which led to kernel oops if hotplug was performed, so checking for eeh_enabled() was introduced to avoid the issue. However, in architectures that EEH is present like pSeries or PowerNV, we might reach a case in which no PCI devices are present on boot time and so EEH is not initialized. Then, if a device is added via DLPAR for example, eeh_add_device_early() fails because eeh_enabled() is false, and EEH end up not being enabled at all. This reverts the aforementioned patch since a new verification was introduced by the commit d91dafc02f42 ("powerpc/eeh: Delay probing EEH device during hotplug") and so the original Cell issue does not happen anymore. Cc: stable@vger.kernel.org # v4.1+ Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/eeh: Drop unnecessary label in eeh_pe_change_owner()Gavin Shan2016-05-121-4/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The label "reset" in eeh_pe_change_owner() is used only for once. No need to keep it and just drop it. No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Russell Currey <ruscur@russell.cc> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/eeh: Ignore handlers in eeh_pe_reset_and_recover()Gavin Shan2016-05-121-7/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrough device are transferred to guest and backwards, meaning the device's driver is vfio-pci or none. In both cases, the handlers triggered by eeh_report_reset() and eeh_report_resume() shouldn't be called. This ignores the error handlers from eeh_report_reset() and eeh_report_resume(). Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Russell Currey <ruscur@russell.cc> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/eeh: Restore initial state in eeh_pe_reset_and_recover()Gavin Shan2016-05-121-0/+23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrough devices are transferred to guest and backwards. The content in the device's config space will be lost on PE reset issued in the middle of the recovery. The function saves/restores it before/after the reset. However, config access to some adapters like Broadcom BCM5719 at this point will cause a fenced PHB. The config space is always blocked and we save 0xFF's that are restored at a later point. The memory BARs are totally corrupted, causing another EEH error upon access to one of the memory BARs. This restores the config space on those adapters like BCM5719 from the content saved to the EEH device when it's populated, to resolve above issue. Fixes: 5cfb20b9 ("powerpc/eeh: Emulate EEH recovery for VFIO devices") Cc: stable@vger.kernel.org #v3.18+ Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Russell Currey <ruscur@russell.cc> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/eeh: Don't report error in eeh_pe_reset_and_recover()Gavin Shan2016-05-121-3/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrough device are transferred to guest and backwards, meaning the device's driver is vfio-pci or none. When the driver is vfio-pci that provides error_detected() error handler only, the handler simply stops the guest and it's not expected behaviour. On the other hand, no error handlers will be called if we don't have a bound driver. This ignores the error handler in eeh_pe_reset_and_recover() that reports the error to device driver to avoid the exceptional behaviour. Fixes: 5cfb20b9 ("powerpc/eeh: Emulate EEH recovery for VFIO devices") Cc: stable@vger.kernel.org #v3.18+ Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Russell Currey <ruscur@russell.cc> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | Revert "powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus()"Michael Ellerman2016-05-121-2/+10
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This reverts commit c8ceacc22bce95d3a9cff198c9c27a30105a16b8. Gavin says: I missed the fact that it affects the PCI passthrou path as reported by Alexey: When passing GPU (0003:01:00.0) which seats behind the root port, the reset request is routed to skiboot in original code. In skiboot, the link bouncing events are masked during the reset. So we don't see EEH (freeze all) error even link bouncing happens. With the changes included, the reset is done by kernel and the link bouncing events aren't masked by altering content of PHB3 (or P7IOC) specific hardware registers which are invisible to kernel (skiboot hides the hardware specific). It means the link bouncing is seen by the root port and it causes a EEH (freeze all) error. The PCI passthrough on GPU device cannot work. Requested-by: Alexey Kardashevskiy <aik@ozlabs.ru> Requested-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: Enable NVLink pass throughAlexey Kardashevskiy2016-05-113-6/+176
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | IBM POWER8 NVlink systems come with Tesla K40-ish GPUs each of which also has a couple of fast speed links (NVLink). The interface to links is exposed as an emulated PCI bridge which is included into the same IOMMU group as the corresponding GPU. In the kernel, NPUs get a separate PHB of the PNV_PHB_NPU type and a PE which behave pretty much as the standard IODA2 PHB except NPU PHB has just a single TVE in the hardware which means it can have either 32bit window or 64bit window or DMA bypass but never two of these. In order to make these links work when GPU is passed to the guest, these bridges need to be passed as well; otherwise performance will degrade. This implements and exports API to manage NPU state in regard to VFIO; it replicates iommu_table_group_ops. This defines a new pnv_pci_ioda2_npu_ops which is assigned to the IODA2 bridge if there are NPUs for a GPU on the bridge. The new callbacks call the default IODA2 callbacks plus new NPU API. This adds a gpe_table_group_to_npe() helper to find NPU PE for the IODA2 table_group, it is not expected to fail as the helper is only called from the pnv_pci_ioda2_npu_ops. This does not define NPU-specific .release_ownership() so after VFIO is finished, DMA on NPU is disabled which is ok as the nvidia driver sets DMA mask when probing which enable 32 or 64bit DMA on NPU. This adds a pnv_pci_npu_setup_iommu() helper which adds NPUs to the GPU group if any found. The helper uses helpers to look for the "ibm,gpu" property in the device tree which is a phandle of the corresponding GPU. This adds an additional loop over PEs in pnv_ioda_setup_dma() as the main loop skips NPU PEs as they do not have 32bit DMA segments. 
As pnv_npu_set_window() and pnv_npu_unset_window() are now used by the new IODA2-NPU IOMMU group, this makes the helpers public and adds the DMA window number parameter. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-By: Alistair Popple <alistair@popple.id.au> [mpe: Add pnv_pci_ioda_setup_iommu_api() to fix build with IOMMU_API=n] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: Rework TCE Kill handlingAlexey Kardashevskiy2016-05-113-112/+27
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The pnv_ioda_pe struct keeps an array of peers. At the moment it is only used to link GPU and NPU for 2 purposes: 1. Access NPU quickly when configuring DMA for GPU - this was addressed in the previous patch by removing use of it as DMA setup is not what the kernel would constantly do. 2. Invalidate TCE cache for NPU when it is invalidated for GPU. GPU and NPU are in different PE. There is already a mechanism to attach multiple iommu_table_group to the same iommu_table (used for VFIO); this patch reuses it here. This gets rid of peers[] array and PNV_IODA_PE_PEER flag as they are not needed anymore. While we are here, add TCE cache invalidation after enabling bypass. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-By: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: Add set/unset window helpersAlexey Kardashevskiy2016-05-111-10/+55
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The upcoming NVLink passthrough support will require NPU code to cope with two DMA windows. This adds a pnv_npu_set_window() helper which programs 32bit window to the hardware. This also adds multilevel TCE support. This adds a pnv_npu_unset_window() helper which removes the DMA window from the hardware. This does not make difference now as the caller - pnv_npu_dma_set_bypass() - enables bypass in the hardware but the next patch will use it to manage TCE table lists for TCE Kill handling. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-By: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/ioda2: Export debug helper pe_level_printk()Alexey Kardashevskiy2016-05-112-8/+10
| | | | | | | | | | | | | | | | | | | | | | | | | | | This exports debugging helper pe_level_printk() and corresponding macros so they can be used in npu-dma.c. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-By: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: Simplify DMA setupAlexey Kardashevskiy2016-05-113-68/+52
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NPU devices are emulated in firmware and mainly used for NPU NVLink training; one NPU device is per a hardware link. Their DMA/TCE setup must match the GPU which is connected via PCIe and NVLink so any changes to the DMA/TCE setup on the GPU PCIe device need to be propagated to the NVLink device as this is what device drivers expect and it doesn't make much sense to do anything else. This makes NPU DMA setup explicit. pnv_npu_ioda_controller_ops::pnv_npu_dma_set_mask is moved to pci-ioda, made static and prints warning as dma_set_mask() should never be called on this function as in any case it will not configure GPU; so we make this explicit. Instead of using PNV_IODA_PE_PEER and peers[] (which the next patch will remove), we test every PCI device if there are corresponding NVLink devices. If there are any, we propagate bypass mode to just found NPU devices by calling the setup helper directly (which takes @bypass) and avoid guessing (i.e. calculating from DMA mask) whether we need bypass or not on NPU devices. Since DMA setup happens in very rare occasion, this will not slow down booting or VFIO start/stop much. This renames pnv_npu_disable_bypass to pnv_npu_dma_set_32 to make it more clear what the function really does which is programming 32bit table address to the TVT ("disabling bypass" means writing zeroes to the TVT). This removes pnv_npu_dma_set_bypass() from pnv_npu_ioda_fixup() as the DMA configuration on NPU does not matter until dma_set_mask() is called on GPU and that will do the NPU DMA configuration. This removes phb->dma_dev_setup initialization for NPU as pnv_pci_ioda_dma_dev_setup is no-op for it anyway. 
This stops using npe->tce_bypass_base as it never changes and values other than zero are not supported. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: Use the correct IOMMU page sizeAlexey Kardashevskiy2016-05-111-6/+5
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This uses the page size from iommu_table instead of hard-coded 4K. This should cause no change in behavior. While we are here, move bits around to prepare for further rework which will define and use iommu_table_group_ops. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/npu: TCE Kill helpers cleanupAlexey Kardashevskiy2016-05-113-52/+25
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NPU PHB TCE Kill register is exactly the same as in the rest of POWER8 so let's reuse the existing code for NPU. The only bit missing is a helper to reset the entire TCE cache so this moves such a helper from NPU code and renames it. Since pnv_npu_tce_invalidate() does really invalidate the entire cache, this uses pnv_pci_ioda2_tce_invalidate_entire() directly for NPU. This adds an explicit comment for workaround for invalidating NPU TCE cache. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Define TCE Kill flagsAlexey Kardashevskiy2016-05-111-2/+5
| | | | | | | | | | | | | | | | | | | | | | | | This replaces magic constants for TCE Kill IODA2 register with macros. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Rename pnv_pci_ioda2_tce_invalidate_entireAlexey Kardashevskiy2016-05-111-3/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | As in fact pnv_pci_ioda2_tce_invalidate_entire() invalidates TCEs for the specific PE rather than the entire cache, rename it to pnv_pci_ioda2_tce_invalidate_pe(). In later patches we will add a proper pnv_pci_ioda2_tce_invalidate_entire(). Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus()Gavin Shan2016-05-111-10/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The function pnv_pci_reset_secondary_bus() is called like below. It's impossible to call the function on the root bus. So it's safe to remove the root bus case in the function. No functional changes introduced. pci_parent_bus_reset() / pci_bus_reset() / pci_try_reset_bus() pci_reset_bridge_secondary_bus() pcibios_reset_secondary_bus() pnv_pci_reset_secondary_bus() Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Daniel Axtens <dja@axtens.net> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Simplify pnv_eeh_reset()Gavin Shan2016-05-111-36/+31
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This drops unnecessary nested if statements in pnv_eeh_reset() to improve the code readability. After the changes, the unused local variable "ret" is dropped as well. No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Don't scan empty slotGavin Shan2016-05-111-1/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In hotplug case, function pci_add_pci_devices() is called to rescan the specified PCI bus, which might not have any child devices. Access to the PCI bus's child device node will cause kernel crash without exception. This adds one more check to skip scanning PCI bus that doesn't have any subordinate devices from device-tree, in order to avoid kernel crash. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Export pci_traverse_device_nodes()Gavin Shan2016-05-113-10/+15
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames traverse_pci_devices() to pci_traverse_device_nodes(). The function traverses all subordinate device nodes of the specified one. Also, below cleanup applied to the function. No logical changes introduced. * Rename "pre" to "fn". * Avoid assignment in if condition reported from checkpatch.pl. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Introduce pci_remove_device_node_info()Gavin Shan2016-05-112-0/+24
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This implements and exports pci_remove_device_node_info(). It's used to remove the pdn (struct pci_dn) for the indicated device node. The function is going to be used by PowerNV PCI hotplug driver. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Export pci_add_device_node_info()Gavin Shan2016-05-113-13/+22
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames update_dn_pci_info() to pci_add_device_node_info() with corresponding adjustment on the parameter type and exports it. The function is used to create pdn (struct pci_dn) for the indicated device node. Another function add_pdn(), almost wrapper of pci_add_device_node_info(), to be used in traverse_pci_devices(). No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Move pci_find_bus_by_node() aroundGavin Shan2016-05-112-31/+29
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This moves pci_find_bus_by_node() from arch/powerpc/platforms/ pseries/pci_dlpar.c to arch/powerpc/kernel/pci-hotplug.c so that the function can be used by pSeries and PowerNV platform at the same time. Also, below cleanup applied. No functional changes introduced. * Remove variable "busdn" in find_bus_among_children() * Use PCI_DN() to convert device node to pci_dn Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Rename pcibios_find_pci_bus()Gavin Shan2016-05-112-4/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames pcibios_find_pci_bus() to pci_find_bus_by_node() to avoid conflicts with those PCI subsystem weak function names, which have prefix "pcibios". No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/pci: Rename pcibios_{add, remove}_pci_devices()Gavin Shan2016-05-113-16/+15
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames pcibios_{add,remove}_pci_devices() to avoid conflicts with names of the weak functions in PCI subsystem, which have the prefix "pcibios". No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-By: Alistair Popple <alistair@popple.id.au> Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Use PE instead of number during setup and releaseGavin Shan2016-05-112-47/+59
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In current implementation, the PEs that are allocated or picked from the reserved list are identified by PE number. The PE instance has to be picked according to the PE number eventually. We have same issue when PE is released. For pnv_ioda_pick_m64_pe() and pnv_ioda_alloc_pe(), this returns PE instance so that pnv_ioda_setup_bus_PE() can use the allocated or reserved PE instance directly. Also, pnv_ioda_setup_bus_PE() returns the reserved/allocated PE instance to be used in subsequent patches. On the other hand, pnv_ioda_free_pe() uses PE instance (not number) as its argument. No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/ioda1: Improve DMA32 segment trackGavin Shan2016-05-112-56/+66
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In current implementation, the DMA32 segments required by one specific PE aren't calculated with the information held in the PE independently. It conflicts with the PCI hotplug design: PE centralized, meaning the PE's DMA32 segments should be calculated from the information held in the PE independently. This introduces an array (@dma32_segmap) for every PHB to track the DMA32 segment usage. Besides, this moves the logic calculating PE's consumed DMA32 segments to pnv_pci_ioda1_setup_dma_pe() so that PE's DMA32 segments are calculated/allocated from the information held in the PE (DMA32 weight). Also the logic is improved: we try to allocate as many DMA32 segments as we can. It's acceptable that fewer DMA32 segments than the expected number are allocated. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Remove DMA32 PE listGavin Shan2016-05-112-112/+78
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | PEs are put into PHB DMA32 list (phb->ioda.pe_dma_list) according to their DMA32 weight. The PEs on the list are iterated to setup their TCE32 tables at system booting time. The list is used for once at boot time and no need to keep it. This moves the logic calculating DMA32 weight of PHB and PE to pnv_ioda_setup_dma() to drop PHB's DMA32 list. Also, every PE traces the consumed DMA32 segment by @tce32_seg and @tce32_segcount are useless and they're removed. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/ioda1: Introduce PNV_IODA1_DMA32_SEGSIZEGavin Shan2016-05-111-13/+18
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently, there is one macro (TCE32_TABLE_SIZE) representing the TCE table size for one DMA32 segment. The constant representing the DMA32 segment size (1 << 28) is still used in the code. This defines PNV_IODA1_DMA32_SEGSIZE representing one DMA32 segment size. The TCE table size can be calculated when the page has fixed 4KB size. So all the related calculation depends on one macro (PNV_IODA1_DMA32_SEGSIZE). No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-By: Alistair Popple <alistair@popple.id.au> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/ioda1: Rename pnv_pci_ioda_setup_dma_pe()Gavin Shan2016-05-111-4/+5
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames pnv_pci_ioda_setup_dma_pe() to pnv_pci_ioda1_setup_dma_pe() as it's the counter-part of IODA2's pnv_pci_ioda2_setup_dma_pe(). No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv/ioda1: M64 support on P7IOCGavin Shan2016-05-111-3/+86
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This enables M64 window on P7IOC, which has been enabled on PHB3. Different from PHB3 where 16 M64 BARs are supported and each of them can be owned by one particular PE# exclusively or divided evenly to 256 segments, every P7IOC PHB has 16 M64 BARs and each of them is divided into 8 segments. So every P7IOC PHB supports 128 M64 segments in total. P7IOC has M64DT, which helps mapping one particular M64 segment# to arbitrary PE#. PHB3 doesn't have M64DT, indicating that one M64 segment can only be pinned to the fixed PE#. In order to unify M64 support on P7IOC and PHB3, we just provide 128 M64 segments on every P7IOC PHB and each of them is pinned to the fixed PE# by bypassing the function of M64DT. In turn, we just need different phb->init_m64() for P7IOC and PHB3 and maps M64 segment in pnv_ioda_reserve_m64_pe() for P7IOC, most of the code are shared by them. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alistair Popple <alistair@popple.id.au> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
| * | powerpc/powernv: Rename M64 related functionsGavin Shan2016-05-111-11/+11
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This renames those functions picking PE number based on consumed M64 segments, mapping M64 segments to PEs as those functions are going to be shared by IODA1/IODA2 in next patch. No logical changes introduced. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>