From 22823ab419d8ed884195cfa75483fd3a99bb1462 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 11 Jan 2016 10:54:54 -0500 Subject: EXPORT_SYMBOL() for asm

Add asm-usable variants of EXPORT_SYMBOL/EXPORT_SYMBOL_GPL. This commit just adds the default implementation; most of the architectures can simply add export.h to asm/Kbuild and start using it from assembler. The rest need to have their asm/export.h define several macros and then explicitly include asm-generic/export.h.

One area where things might diverge from the default is the alignment; normally it's 8 bytes on 64bit targets and 4 on 32bit ones, both for unsigned long and for struct kernel_symbol. Unfortunately, amd64 and m68k are unusual - m68k aligns to 2 bytes (for both) and amd64 aligns struct kernel_symbol to 16 bytes. For those we'll need asm/export.h to override the constants used by the generic version - KSYM_ALIGN and KCRC_ALIGN for kernel_symbol and unsigned long respectively. And no, __alignof__ would not do the trick - on amd64 __alignof__ of struct kernel_symbol is 8, not 16.

A more serious source of unpleasantness is the treatment of function descriptors on architectures that have them. Things like ppc64, parisc, ia64, etc. need more than the address of the first insn to call an arbitrary function. As a result, their representation of pointers to functions is not the typical "address of the entry point" - it's the address of a small static structure containing all the required information (including the entry point, of course). Sadly, the asm-side conventions differ in what the function name refers to - the entry point or the function descriptor. On ppc64 we do the latter; bar: .quad foo is what void (*bar)(void) = foo; turns into, and the rare places where we need to explicitly work with the label of the entry point are dealt with as DOTSYM(foo). For our purposes that's ideal - the generic macros are usable. However, parisc would have foo and P%foo used for the label of the entry point and the address of the function descriptor, and bar: .long P%foo would be used instead. ia64 is similar to parisc in that respect, except that there it's @fptr(foo) rather than P%foo. Such architectures need to define KSYM_FUNC to turn a function name into whatever is needed to refer to its function descriptor.

What's more, on such architectures we need to know whether we are exporting a function or an object - in assembler we have to tell that explicitly, to decide whether we want EXPORT_SYMBOL(foo) to produce e.g. __ksymtab_foo: .quad foo or __ksymtab_foo: .quad @fptr(foo) For that reason we introduce EXPORT_DATA_SYMBOL{,_GPL}(), to be used for exports of data objects. On normal architectures it's the same thing as EXPORT_SYMBOL{,_GPL}(), but on parisc-like ones they differ and the right one needs to be used. Most of the exports are functions, so we keep EXPORT_SYMBOL for those...
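As an illustration of how an architecture hooks into this (not part of this commit; the macro values below are assumptions for a made-up ia64-like, function-descriptor target), its asm/export.h would define the override macros before pulling in the generic header:

/* hypothetical asm/export.h for a function-descriptor (ia64-like) architecture */
#ifndef _ASM_EXPORT_H
#define _ASM_EXPORT_H

/* exported functions are referred to via their descriptors, not entry points */
#define KSYM_FUNC(name)		@fptr(name)

/* assumed alignments; only needed where they differ from the generic defaults */
#define KSYM_ALIGN		16
#define KCRC_ALIGN		8

#include <asm-generic/export.h>

#endif

An assembler source would then do #include <asm/export.h> and invoke EXPORT_SYMBOL(memcpy) for a function export (routed through KSYM_FUNC()) or EXPORT_DATA_SYMBOL(empty_zero_page) for a data export (plain address).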
Signed-off-by: Al Viro --- include/asm-generic/export.h | 94 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 include/asm-generic/export.h (limited to 'include') diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h new file mode 100644 index 000000000000..43199a049da5 --- /dev/null +++ b/include/asm-generic/export.h @@ -0,0 +1,94 @@ +#ifndef __ASM_GENERIC_EXPORT_H +#define __ASM_GENERIC_EXPORT_H + +#ifndef KSYM_FUNC +#define KSYM_FUNC(x) x +#endif +#ifdef CONFIG_64BIT +#define __put .quad +#ifndef KSYM_ALIGN +#define KSYM_ALIGN 8 +#endif +#ifndef KCRC_ALIGN +#define KCRC_ALIGN 8 +#endif +#else +#define __put .long +#ifndef KSYM_ALIGN +#define KSYM_ALIGN 4 +#endif +#ifndef KCRC_ALIGN +#define KCRC_ALIGN 4 +#endif +#endif + +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX +#define KSYM(name) _##name +#else +#define KSYM(name) name +#endif + +/* + * note on .section use: @progbits vs %progbits nastiness doesn't matter, + * since we immediately emit into those sections anyway. + */ +.macro ___EXPORT_SYMBOL name,val,sec +#ifdef CONFIG_MODULES + .globl KSYM(__ksymtab_\name) + .section ___ksymtab\sec+\name,"a" + .balign KSYM_ALIGN +KSYM(__ksymtab_\name): + __put \val, KSYM(__kstrtab_\name) + .previous + .section __ksymtab_strings,"a" +KSYM(__kstrtab_\name): +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX + .asciz "_\name" +#else + .asciz "\name" +#endif + .previous +#ifdef CONFIG_MODVERSIONS + .section ___kcrctab\sec+\name,"a" + .balign KCRC_ALIGN +KSYM(__kcrctab_\name): + __put KSYM(__crc_\name) + .weak KSYM(__crc_\name) + .previous +#endif +#endif +.endm +#undef __put + +#if defined(__KSYM_DEPS__) + +#define __EXPORT_SYMBOL(sym, val, sec) === __KSYM_##sym === + +#elif defined(CONFIG_TRIM_UNUSED_KSYMS) + +#include +#include + +#define __EXPORT_SYMBOL(sym, val, sec) \ + __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) +#define __cond_export_sym(sym, val, sec, conf) \ + ___cond_export_sym(sym, val, sec, conf) +#define ___cond_export_sym(sym, val, sec, enabled) \ + __cond_export_sym_##enabled(sym, val, sec) +#define __cond_export_sym_1(sym, val, sec) ___EXPORT_SYMBOL sym, val, sec +#define __cond_export_sym_0(sym, val, sec) /* nothing */ + +#else +#define __EXPORT_SYMBOL(sym, val, sec) ___EXPORT_SYMBOL sym, val, sec +#endif + +#define EXPORT_SYMBOL(name) \ + __EXPORT_SYMBOL(name, KSYM_FUNC(KSYM(name)),) +#define EXPORT_SYMBOL_GPL(name) \ + __EXPORT_SYMBOL(name, KSYM_FUNC(KSYM(name)), _gpl) +#define EXPORT_DATA_SYMBOL(name) \ + __EXPORT_SYMBOL(name, KSYM(name),) +#define EXPORT_DATA_SYMBOL_GPL(name) \ + __EXPORT_SYMBOL(name, KSYM(name),_gpl) + +#endif -- cgit v1.2.3 From 0e0b2afdf644aa523f5eb10ce1f9e3c6cd8362ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:43 -0400 Subject: kernfs: add dummy implementation of kernfs_path_from_node() The dummy version of kernfs_path_from_node() was missing. This currently doesn't break anything. Let's add it for consistency and to ease adding wrappers around it. v2: Removed stray ';' which was causing build failures. 
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 96356ef012de..7d2efd2128bb 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -344,6 +344,11 @@ static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) static inline size_t kernfs_path_len(struct kernfs_node *kn) { return 0; } +static inline int kernfs_path_from_node(struct kernfs_node *root_kn, + struct kernfs_node *kn, + char *buf, size_t buflen) +{ return -ENOSYS; } + static inline char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } -- cgit v1.2.3 From 3abb1d90f5d930c6183534a624aa0158a71bc5eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: kernfs: make kernfs_path*() behave in the style of strlcpy() kernfs_path*() functions always return the length of the full path but the path content is undefined if the length is larger than the provided buffer. This makes its behavior different from strlcpy() and requires error handling in all its users even when they don't care about truncation. In addition, the implementation can actully be simplified by making it behave properly in strlcpy() style. * Update kernfs_path_from_node_locked() to always fill up the buffer with path. If the buffer is not large enough, the output is truncated and terminated. * kernfs_path() no longer needs error handling. Make it a simple inline wrapper around kernfs_path_from_node(). * sysfs_warn_dup()'s use of kernfs_path() doesn't need error handling. Updated accordingly. * cgroup_path()'s use of kernfs_path() updated to retain the old behavior. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Serge Hallyn --- fs/kernfs/dir.c | 61 ++++++++++++++------------------------------------ fs/sysfs/dir.c | 6 ++--- include/linux/cgroup.h | 7 +++++- include/linux/kernfs.h | 21 ++++++++++++----- 4 files changed, 42 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e57174d43683..09242aaa8829 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * return value: length of the string. If greater than buflen, - * then contents of buf are undefined. On error, -1 is returned. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, @@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; - size_t depth_from, depth_to, len = 0, nlen = 0; - char *p; - int i; + size_t depth_from, depth_to, len = 0; + int i, j; if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) - return -1; + return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); @@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, len < buflen ? 
buflen - len : 0); /* Calculate how many bytes we need for the rest */ - for (kn = kn_to; kn != common; kn = kn->parent) - nlen += strlen(kn->name) + 1; - - if (len + nlen >= buflen) - return len + nlen; - - p = buf + len + nlen; - *p = '\0'; - for (kn = kn_to; kn != common; kn = kn->parent) { - size_t tmp = strlen(kn->name); - p -= tmp; - memcpy(p, kn->name, tmp); - *(--p) = '/'; + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); } - return len + nlen; + return len; } /** @@ -220,8 +214,9 @@ size_t kernfs_path_len(struct kernfs_node *kn) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * If @buf isn't long enough, the return value will be greater than @buflen - * and @buf contents are undefined. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) @@ -236,28 +231,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, } EXPORT_SYMBOL_GPL(kernfs_path_from_node); -/** - * kernfs_path - build full path of a given node - * @kn: kernfs_node of interest - * @buf: buffer to copy @kn's name into - * @buflen: size of @buf - * - * Builds and returns the full path of @kn in @buf of @buflen bytes. The - * path is built from the end of @buf so the returned pointer usually - * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated - * and %NULL is returned. - */ -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) -{ - int ret; - - ret = kernfs_path_from_node(kn, NULL, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; -} -EXPORT_SYMBOL_GPL(kernfs_path); - /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 94374e435025..2b67bda2021b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *buf, *path = NULL; + char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) - path = kernfs_path(parent, buf, PATH_MAX); + kernfs_path(parent, buf, PATH_MAX); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - path, name); + buf, name); kfree(buf); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 984f73b719a9..5a9abdee43fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -541,7 +541,12 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { - return kernfs_path(cgrp->kn, buf, buflen); + int ret; + + ret = kernfs_path(cgrp->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 7d2efd2128bb..4a02b1b49821 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -272,7 +272,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); size_t kernfs_path_len(struct kernfs_node *kn); int 
kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); @@ -349,10 +348,6 @@ static inline int kernfs_path_from_node(struct kernfs_node *root_kn, char *buf, size_t buflen) { return -ENOSYS; } -static inline char *kernfs_path(struct kernfs_node *kn, char *buf, - size_t buflen) -{ return NULL; } - static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } @@ -441,6 +436,22 @@ static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ +/** + * kernfs_path - build full path of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Builds and returns the full path of @kn in @buf of @buflen bytes. The + * path is built from the end of @buf so the returned pointer usually + * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated + * and %NULL is returned. + */ +static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) +{ + return kernfs_path_from_node(kn, NULL, buf, buflen); +} + static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { -- cgit v1.2.3 From bb09c8634b1e484b8840fb2384d55739bfcb68bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: kernfs: remove kernfs_path_len() It doesn't have any in-kernel user and the same result can be obtained from kernfs_path(@kn, NULL, 0). Remove it. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Serge Hallyn --- fs/kernfs/dir.c | 23 ----------------------- include/linux/kernfs.h | 4 ---- 2 files changed, 27 deletions(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 09242aaa8829..6e7fd37615f8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -179,29 +179,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) return ret; } -/** - * kernfs_path_len - determine the length of the full path of a given node - * @kn: kernfs_node of interest - * - * The returned length doesn't include the space for the terminating '\0'. - */ -size_t kernfs_path_len(struct kernfs_node *kn) -{ - size_t len = 0; - unsigned long flags; - - spin_lock_irqsave(&kernfs_rename_lock, flags); - - do { - len += strlen(kn->name) + 1; - kn = kn->parent; - } while (kn && kn->parent); - - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - - return len; -} - /** * kernfs_path_from_node - build path of node @to relative to @from. 
* @from: parent kernfs_node relative to which we need to build the path diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 4a02b1b49821..7056238fd9f5 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -269,7 +269,6 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); -size_t kernfs_path_len(struct kernfs_node *kn); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -340,9 +339,6 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } -static inline size_t kernfs_path_len(struct kernfs_node *kn) -{ return 0; } - static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) -- cgit v1.2.3 From 4c737b41de7f4eef2a593803bad1b918dd718b10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: make cgroup_path() and friends behave in the style of strlcpy() cgroup_path() and friends used to format the path from the end and thus the resulting path usually didn't start at the start of the passed in buffer. Also, when the buffer was too small, the partial result was truncated from the head rather than tail and there was no way to tell how long the full path would be. These make the functions less robust and more awkward to use. With recent updates to kernfs_path(), cgroup_path() and friends can be made to behave in strlcpy() style. * cgroup_path(), cgroup_path_ns[_locked]() and task_cgroup_path() now always return the length of the full path. If buffer is too small, it contains nul terminated truncated output. * All users updated accordingly. v2: cgroup_path() usage in kernel/sched/debug.c converted. 
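Under the new convention a caller that cares about truncation only needs to compare the return value against the buffer size. The sketch below is illustrative rather than taken from the patch; show_cgroup_path() and its calling context are made up:

#include <linux/cgroup.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

/* illustrative caller relying on the strlcpy()-style return value */
static int show_cgroup_path(struct seq_file *m, struct cgroup *cgrp)
{
	char *buf;
	int ret;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = cgroup_path(cgrp, buf, PATH_MAX);	/* always the full path length */
	if (ret < 0) {				/* propagated kernfs error */
		kfree(buf);
		return ret;
	}
	if (ret >= PATH_MAX) {			/* output was truncated */
		kfree(buf);
		return -ENAMETOOLONG;
	}

	seq_puts(m, buf);	/* path starts at buf[0] and is nul terminated */
	kfree(buf);
	return 0;
}

Callers that don't care about truncation, such as sysfs_warn_dup() above, can simply ignore the return value.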
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Serge Hallyn Cc: Peter Zijlstra --- include/linux/blk-cgroup.h | 11 +---------- include/linux/cgroup.h | 16 +++++----------- kernel/cgroup.c | 48 +++++++++++++++++++++------------------------- kernel/cpuset.c | 12 ++++++------ kernel/sched/debug.c | 3 ++- 5 files changed, 36 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 10648e300c93..4e8c215e185c 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -343,16 +343,7 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; + return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5a9abdee43fe..6df36361a492 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -97,7 +97,7 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); @@ -538,15 +538,9 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) return kernfs_name(cgrp->kn, buf, buflen); } -static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, - size_t buflen) +static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { - int ret; - - ret = kernfs_path(cgrp->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) @@ -639,8 +633,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..3861161e460f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2315,22 +2315,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int 
ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2353,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2367,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -5716,7 +5711,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5759,18 +5754,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) { retval = -ENAMETOOLONG; goto out_unlock; } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6035,8 +6030,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6046,13 +6042,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..793ae6fd96dc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2689,7 +2689,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2700,18 +2700,18 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: kfree(buf); out: - return retval; + return 0; } #endif /* 
CONFIG_PROC_PID_CPUSET */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..23cb609ba4eb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -410,7 +410,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif -- cgit v1.2.3 From ed1777de25e45bfb58fad63341904f8a77911785 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: add tracepoints for basic operations Debugging what goes wrong with cgroup setup can get hairy. Add tracepoints for cgroup hierarchy mount, cgroup creation/destruction and task migration operations for better visibility. Signed-off-by: Tejun Heo --- include/trace/events/cgroup.h | 163 ++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 25 +++++++ 2 files changed, 188 insertions(+) create mode 100644 include/trace/events/cgroup.h (limited to 'include') diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h new file mode 100644 index 000000000000..ab68640a18d0 --- /dev/null +++ b/include/trace/events/cgroup.h @@ -0,0 +1,163 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cgroup + +#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CGROUP_H + +#include +#include + +DECLARE_EVENT_CLASS(cgroup_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root), + + TP_STRUCT__entry( + __field( int, root ) + __field( u16, ss_mask ) + __string( name, root->name ) + ), + + TP_fast_assign( + __entry->root = root->hierarchy_id; + __entry->ss_mask = root->subsys_mask; + __assign_str(name, root->name); + ), + + TP_printk("root=%d ss_mask=%#x name=%s", + __entry->root, __entry->ss_mask, __get_str(name)) +); + +DEFINE_EVENT(cgroup_root, cgroup_setup_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DEFINE_EVENT(cgroup_root, cgroup_destroy_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DEFINE_EVENT(cgroup_root, cgroup_remount, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DECLARE_EVENT_CLASS(cgroup, + + TP_PROTO(struct cgroup *cgrp), + + TP_ARGS(cgrp), + + TP_STRUCT__entry( + __field( int, root ) + __field( int, id ) + __field( int, level ) + __dynamic_array(char, path, + cgrp->kn ? 
cgroup_path(cgrp, NULL, 0) + 1 + : strlen("(null)")) + ), + + TP_fast_assign( + __entry->root = cgrp->root->hierarchy_id; + __entry->id = cgrp->id; + __entry->level = cgrp->level; + if (cgrp->kn) + cgroup_path(cgrp, __get_dynamic_array(path), + __get_dynamic_array_len(path)); + else + __assign_str(path, "(null)"); + ), + + TP_printk("root=%d id=%d level=%d path=%s", + __entry->root, __entry->id, __entry->level, __get_str(path)) +); + +DEFINE_EVENT(cgroup, cgroup_mkdir, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_rmdir, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_release, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_rename, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DECLARE_EVENT_CLASS(cgroup_migrate, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup), + + TP_STRUCT__entry( + __field( int, dst_root ) + __field( int, dst_id ) + __field( int, dst_level ) + __dynamic_array(char, dst_path, + dst_cgrp->kn ? cgroup_path(dst_cgrp, NULL, 0) + 1 + : strlen("(null)")) + __field( int, pid ) + __string( comm, task->comm ) + ), + + TP_fast_assign( + __entry->dst_root = dst_cgrp->root->hierarchy_id; + __entry->dst_id = dst_cgrp->id; + __entry->dst_level = dst_cgrp->level; + if (dst_cgrp->kn) + cgroup_path(dst_cgrp, __get_dynamic_array(dst_path), + __get_dynamic_array_len(dst_path)); + else + __assign_str(dst_path, "(null)"); + __entry->pid = task->pid; + __assign_str(comm, task->comm); + ), + + TP_printk("dst_root=%d dst_id=%d dst_level=%d dst_path=%s pid=%d comm=%s", + __entry->dst_root, __entry->dst_id, __entry->dst_level, + __get_str(dst_path), __entry->pid, __get_str(comm)) +); + +DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup) +); + +DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup) +); + +#endif /* _TRACE_CGROUP_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3861161e460f..5e2e81ad9175 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * pidlists linger the following amount before being destroyed. 
The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2825,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3587,6 +3601,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4355,6 +4371,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5020,6 +5038,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5306,6 +5326,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5481,6 +5503,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } -- cgit v1.2.3 From 9379e6b8e0f995365dc6158a1463c8dab4f2c8da Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Sun, 21 Aug 2016 23:23:18 -0500 Subject: libata: Safely overwrite attached page in WRITE SAME xlat Safely overwriting the attached page to ATA format from the SCSI formatted variant. Signed-off-by: Shaun Tancheff Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo --- drivers/ata/libata-scsi.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/ata.h | 26 ---------------------- 2 files changed, 51 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index be9c76c938b2..84e7f48ca83e 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3282,6 +3282,54 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc) return 1; } +/** + * ata_format_dsm_trim_descr() - SATL Write Same to DSM Trim + * @cmd: SCSI command being translated + * @num: Maximum number of entries (nominally 64). + * @sector: Starting sector + * @count: Total Range of request + * + * Rewrite the WRITE SAME descriptor to be a DSM TRIM little-endian formatted + * descriptor. 
+ * + * Upto 64 entries of the format: + * 63:48 Range Length + * 47:0 LBA + * + * Range Length of 0 is ignored. + * LBA's should be sorted order and not overlap. + * + * NOTE: this is the same format as ADD LBA(S) TO NV CACHE PINNED SET + */ +static unsigned int ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 num, + u64 sector, u32 count) +{ + __le64 *buffer; + u32 i = 0, used_bytes; + unsigned long flags; + + BUILD_BUG_ON(512 > ATA_SCSI_RBUF_SIZE); + + spin_lock_irqsave(&ata_scsi_rbuf_lock, flags); + buffer = ((void *)ata_scsi_rbuf); + while (i < num) { + u64 entry = sector | + ((u64)(count > 0xffff ? 0xffff : count) << 48); + buffer[i++] = __cpu_to_le64(entry); + if (count <= 0xffff) + break; + count -= 0xffff; + sector += 0xffff; + } + + used_bytes = ALIGN(i * 8, 512); + memset(buffer + i, 0, used_bytes - i * 8); + sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buffer, 512); + spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags); + + return used_bytes; +} + static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) { struct ata_taskfile *tf = &qc->tf; @@ -3290,8 +3338,8 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) const u8 *cdb = scmd->cmnd; u64 block; u32 n_block; + const u32 trmax = ATA_MAX_TRIM_RNUM; u32 size; - void *buf; u16 fp; u8 bp = 0xff; @@ -3319,10 +3367,8 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) if (!scsi_sg_count(scmd)) goto invalid_param_len; - buf = page_address(sg_page(scsi_sglist(scmd))); - - if (n_block <= 65535 * ATA_MAX_TRIM_RNUM) { - size = ata_set_lba_range_entries(buf, ATA_MAX_TRIM_RNUM, block, n_block); + if (n_block <= 0xffff * trmax) { + size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block); } else { fp = 2; goto invalid_fld; diff --git a/include/linux/ata.h b/include/linux/ata.h index adbc812c009b..45a1d71c55f1 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -1071,32 +1071,6 @@ static inline void ata_id_to_hd_driveid(u16 *id) #endif } -/* - * Write LBA Range Entries to the buffer that will cover the extent from - * sector to sector + count. This is used for TRIM and for ADD LBA(S) - * TO NV CACHE PINNED SET. - */ -static inline unsigned ata_set_lba_range_entries(void *_buffer, - unsigned num, u64 sector, unsigned long count) -{ - __le64 *buffer = _buffer; - unsigned i = 0, used_bytes; - - while (i < num) { - u64 entry = sector | - ((u64)(count > 0xffff ? 0xffff : count) << 48); - buffer[i++] = __cpu_to_le64(entry); - if (count <= 0xffff) - break; - count -= 0xffff; - sector += 0xffff; - } - - used_bytes = ALIGN(i * 8, 512); - memset(buffer + i, 0, used_bytes - i * 8); - return used_bytes; -} - static inline bool ata_ok(u8 status) { return ((status & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | ATA_ERR)) -- cgit v1.2.3 From 7b20309428598df00ffeb0b01f5948dea6aaf1f7 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Sun, 21 Aug 2016 23:23:19 -0500 Subject: libata: Add support for SCT Write Same SATA drives may support write same via SCT. This is useful for setting the drive contents to a specific pattern (0's). Translate a SCSI WRITE SAME 16 command to be either a DSM TRIM command or an SCT Write Same command. 
Based on the UNMAP flag: - When set translate to DSM TRIM - When not set translate to SCT Write Same Signed-off-by: Shaun Tancheff Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo --- drivers/ata/libata-scsi.c | 199 +++++++++++++++++++++++++++++++++++++++------- include/linux/ata.h | 43 ++++++++++ 2 files changed, 213 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 84e7f48ca83e..08d9c8e731b3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1159,8 +1159,6 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev) { sdev->use_10_for_rw = 1; sdev->use_10_for_ms = 1; - sdev->no_report_opcodes = 1; - sdev->no_write_same = 1; /* Schedule policy is determined by ->qc_defer() callback and * it needs to see every deferred qc. Set dev_blocked to 1 to @@ -3287,7 +3285,7 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc) * @cmd: SCSI command being translated * @num: Maximum number of entries (nominally 64). * @sector: Starting sector - * @count: Total Range of request + * @count: Total Range of request in logical sectors * * Rewrite the WRITE SAME descriptor to be a DSM TRIM little-endian formatted * descriptor. @@ -3330,6 +3328,45 @@ static unsigned int ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 num, return used_bytes; } +/** + * ata_format_dsm_trim_descr() - SATL Write Same to ATA SCT Write Same + * @cmd: SCSI command being translated + * @lba: Starting sector + * @num: Number of logical sectors to be zero'd. + * + * Rewrite the WRITE SAME descriptor to be an SCT Write Same formatted + * descriptor. + * NOTE: Writes a pattern (0's) in the foreground. + * Large write-same requents can timeout. + */ +static void ata_format_sct_write_same(struct scsi_cmnd *cmd, u64 lba, u64 num) +{ + u16 *sctpg; + unsigned long flags; + + spin_lock_irqsave(&ata_scsi_rbuf_lock, flags); + sctpg = ((void *)ata_scsi_rbuf); + + put_unaligned_le16(0x0002, &sctpg[0]); /* SCT_ACT_WRITE_SAME */ + put_unaligned_le16(0x0101, &sctpg[1]); /* WRITE PTRN FG */ + put_unaligned_le64(lba, &sctpg[2]); + put_unaligned_le64(num, &sctpg[6]); + put_unaligned_le32(0u, &sctpg[10]); + + sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), sctpg, 512); + spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags); +} + +/** + * ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same + * @qc: Command to be translated + * + * Translate a SCSI WRITE SAME command to be either a DSM TRIM command or + * an SCT Write Same command. + * Based on WRITE SAME has the UNMAP flag + * When set translate to DSM TRIM + * When clear translate to SCT Write Same + */ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) { struct ata_taskfile *tf = &qc->tf; @@ -3342,6 +3379,7 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) u32 size; u16 fp; u8 bp = 0xff; + u8 unmap = cdb[1] & 0x8; /* we may not issue DMA commands if no DMA mode is set */ if (unlikely(!dev->dma_mode)) @@ -3353,11 +3391,26 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) } scsi_16_lba_len(cdb, &block, &n_block); - /* for now we only support WRITE SAME with the unmap bit set */ - if (unlikely(!(cdb[1] & 0x8))) { - fp = 1; - bp = 3; - goto invalid_fld; + if (unmap) { + /* If trim is not enabled the cmd is invalid. 
*/ + if ((dev->horkage & ATA_HORKAGE_NOTRIM) || + !ata_id_has_trim(dev->id)) { + fp = 1; + bp = 3; + goto invalid_fld; + } + /* If the request is too large the cmd is invalid */ + if (n_block > 0xffff * trmax) { + fp = 2; + goto invalid_fld; + } + } else { + /* If write same is not available the cmd is invalid */ + if (!ata_id_sct_write_same(dev->id)) { + fp = 1; + bp = 3; + goto invalid_fld; + } } /* @@ -3367,30 +3420,42 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) if (!scsi_sg_count(scmd)) goto invalid_param_len; - if (n_block <= 0xffff * trmax) { + if (unmap) { size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block); + if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) { + /* Newer devices support queued TRIM commands */ + tf->protocol = ATA_PROT_NCQ; + tf->command = ATA_CMD_FPDMA_SEND; + tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f; + tf->nsect = qc->tag << 3; + tf->hob_feature = (size / 512) >> 8; + tf->feature = size / 512; + + tf->auxiliary = 1; + } else { + tf->protocol = ATA_PROT_DMA; + tf->hob_feature = 0; + tf->feature = ATA_DSM_TRIM; + tf->hob_nsect = (size / 512) >> 8; + tf->nsect = size / 512; + tf->command = ATA_CMD_DSM; + } } else { - fp = 2; - goto invalid_fld; - } - - if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) { - /* Newer devices support queued TRIM commands */ - tf->protocol = ATA_PROT_NCQ; - tf->command = ATA_CMD_FPDMA_SEND; - tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f; - tf->nsect = qc->tag << 3; - tf->hob_feature = (size / 512) >> 8; - tf->feature = size / 512; + ata_format_sct_write_same(scmd, block, n_block); - tf->auxiliary = 1; - } else { - tf->protocol = ATA_PROT_DMA; tf->hob_feature = 0; - tf->feature = ATA_DSM_TRIM; - tf->hob_nsect = (size / 512) >> 8; - tf->nsect = size / 512; - tf->command = ATA_CMD_DSM; + tf->feature = 0; + tf->hob_nsect = 0; + tf->nsect = 1; + tf->lbah = 0; + tf->lbam = 0; + tf->lbal = ATA_CMD_STANDBYNOW1; + tf->hob_lbah = 0; + tf->hob_lbam = 0; + tf->hob_lbal = 0; + tf->device = ATA_CMD_STANDBYNOW1; + tf->protocol = ATA_PROT_DMA; + tf->command = ATA_CMD_WRITE_LOG_DMA_EXT; } tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 | @@ -3413,6 +3478,76 @@ invalid_opcode: return 1; } +/** + * ata_scsiop_maint_in - Simulate a subset of MAINTENANCE_IN + * @args: device MAINTENANCE_IN data / SCSI command of interest. + * @rbuf: Response buffer, to which simulated SCSI cmd output is sent. 
+ * + * Yields a subset to satisfy scsi_report_opcode() + * + * LOCKING: + * spin_lock_irqsave(host lock) + */ +static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf) +{ + struct ata_device *dev = args->dev; + u8 *cdb = args->cmd->cmnd; + u8 supported = 0; + unsigned int err = 0; + + if (cdb[2] != 1) { + ata_dev_warn(dev, "invalid command format %d\n", cdb[2]); + err = 2; + goto out; + } + switch (cdb[3]) { + case INQUIRY: + case MODE_SENSE: + case MODE_SENSE_10: + case READ_CAPACITY: + case SERVICE_ACTION_IN_16: + case REPORT_LUNS: + case REQUEST_SENSE: + case SYNCHRONIZE_CACHE: + case REZERO_UNIT: + case SEEK_6: + case SEEK_10: + case TEST_UNIT_READY: + case SEND_DIAGNOSTIC: + case MAINTENANCE_IN: + case READ_6: + case READ_10: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_16: + case ATA_12: + case ATA_16: + case VERIFY: + case VERIFY_16: + case MODE_SELECT: + case MODE_SELECT_10: + case START_STOP: + supported = 3; + break; + case WRITE_SAME_16: + if (ata_id_sct_write_same(dev->id)) + supported = 3; + break; + case ZBC_IN: + case ZBC_OUT: + if (ata_id_zoned_cap(dev->id) || + dev->class == ATA_DEV_ZAC) + supported = 3; + break; + default: + break; + } +out: + rbuf[1] = supported; /* supported */ + return err; +} + /** * ata_scsi_report_zones_complete - convert ATA output * @qc: command structure returning the data @@ -4193,6 +4328,13 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) ata_scsi_invalid_field(dev, cmd, 1); break; + case MAINTENANCE_IN: + if (scsicmd[1] == MI_REPORT_SUPPORTED_OPERATION_CODES) + ata_scsi_rbuf_fill(&args, ata_scsiop_maint_in); + else + ata_scsi_invalid_field(dev, cmd, 1); + break; + /* all other commands */ default: ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); @@ -4225,7 +4367,6 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) shost->max_lun = 1; shost->max_channel = 1; shost->max_cmd_len = 16; - shost->no_write_same = 1; /* Schedule policy is determined by ->qc_defer() * callback and it needs to see every deferred qc. diff --git a/include/linux/ata.h b/include/linux/ata.h index 45a1d71c55f1..fdb180367ba1 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -105,6 +105,7 @@ enum { ATA_ID_CFA_KEY_MGMT = 162, ATA_ID_CFA_MODES = 163, ATA_ID_DATA_SET_MGMT = 169, + ATA_ID_SCT_CMD_XPORT = 206, ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), @@ -788,6 +789,48 @@ static inline bool ata_id_sense_reporting_enabled(const u16 *id) return id[ATA_ID_COMMAND_SET_4] & (1 << 6); } +/** + * + * Word: 206 - SCT Command Transport + * 15:12 - Vendor Specific + * 11:6 - Reserved + * 5 - SCT Command Transport Data Tables supported + * 4 - SCT Command Transport Features Control supported + * 3 - SCT Command Transport Error Recovery Control supported + * 2 - SCT Command Transport Write Same supported + * 1 - SCT Command Transport Long Sector Access supported + * 0 - SCT Command Transport supported + */ +static inline bool ata_id_sct_data_tables(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 5) ? true : false; +} + +static inline bool ata_id_sct_features_ctrl(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 4) ? true : false; +} + +static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false; +} + +static inline bool ata_id_sct_write_same(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? 
true : false; +} + +static inline bool ata_id_sct_long_sector_access(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? true : false; +} + +static inline bool ata_id_sct_supported(const u16 *id) +{ + return id[ATA_ID_SCT_CMD_XPORT] & (1 << 0) ? true : false; +} + /** * ata_id_major_version - get ATA level of drive * @id: Identify data -- cgit v1.2.3 From b67067f1176df6ee727450546b58704e4b588563 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Aug 2016 22:29:20 +1000 Subject: kbuild: allow archs to select link dead code/data elimination Introduce LD_DEAD_CODE_DATA_ELIMINATION option for architectures to select to build with -ffunction-sections, -fdata-sections, and link with --gc-sections. It requires some work (documented) to ensure all unreferenced entrypoints are live, and requires toolchain and build verification, so it is made a per-arch option for now. On a random powerpc64le build, this yelds a significant size saving, it boots and runs fine, but there is a lot I haven't tested as yet, so these savings may be reduced if there are bugs in the link. text data bss dec filename 11169741 1180744 1923176 14273661 vmlinux 10445269 1004127 1919707 13369103 vmlinux.dce ~700K text, ~170K data, 6% removed from kernel image size. Signed-off-by: Nicholas Piggin Signed-off-by: Michal Marek --- Makefile | 9 +++++++ arch/Kconfig | 13 ++++++++++ include/asm-generic/vmlinux.lds.h | 52 ++++++++++++++++++++++----------------- include/linux/compiler.h | 23 +++++++++++++++++ include/linux/export.h | 30 +++++++++++----------- include/linux/init.h | 38 ++++++++++------------------ init/Makefile | 2 ++ 7 files changed, 104 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/Makefile b/Makefile index 70de1448c571..6d40f3a3a8cc 100644 --- a/Makefile +++ b/Makefile @@ -622,6 +622,11 @@ include arch/$(SRCARCH)/Makefile KBUILD_CFLAGS += $(call cc-option,-fno-delete-null-pointer-checks,) KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,) +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +KBUILD_CFLAGS += $(call cc-option,-ffunction-sections,) +KBUILD_CFLAGS += $(call cc-option,-fdata-sections,) +endif + ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os else @@ -809,6 +814,10 @@ LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\ KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID) LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID) +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +LDFLAGS_vmlinux += $(call ld-option, --gc-sections,) +endif + ifeq ($(CONFIG_STRIP_ASM_SYMS),y) LDFLAGS_vmlinux += $(call ld-option, -X,) endif diff --git a/arch/Kconfig b/arch/Kconfig index 6842154813e5..3f948c422d9d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -467,6 +467,19 @@ config THIN_ARCHIVES Select this if the architecture wants to use thin archives instead of ld -r to create the built-in.o files. +config LD_DEAD_CODE_DATA_ELIMINATION + bool + help + Select this if the architecture wants to do dead code and + data elimination with the linker by compiling with + -ffunction-sections -fdata-sections and linking with + --gc-sections. + + This requires that the arch annotates or otherwise protects + its external entry points from being discarded. Linker scripts + must also merge .text.*, .data.*, and .bss.* correctly into + output sections. 
+ config HAVE_CONTEXT_TRACKING bool help diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 24563970ff7b..ad9d8f94dc7a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -196,9 +196,14 @@ *(.dtb.init.rodata) \ VMLINUX_SYMBOL(__dtb_end) = .; -/* .data section */ +/* + * .data section + * -fdata-sections generates .data.identifier which needs to be pulled in + * with .data, but don't want to pull in .data..stuff which has its own + * requirements. Same for bss. + */ #define DATA_DATA \ - *(.data) \ + *(.data .data.[0-9a-zA-Z_]*) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ MEM_KEEP(init.data) \ @@ -320,76 +325,76 @@ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ - *(SORT(___ksymtab+*)) \ + KEEP(*(SORT(___ksymtab+*))) \ VMLINUX_SYMBOL(__stop___ksymtab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl) = .; \ - *(SORT(___ksymtab_gpl+*)) \ + KEEP(*(SORT(___ksymtab_gpl+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __ksymtab_unused : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused) = .; \ - *(SORT(___ksymtab_unused+*)) \ + KEEP(*(SORT(___ksymtab_unused+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused_gpl) = .; \ - *(SORT(___ksymtab_unused_gpl+*)) \ + KEEP(*(SORT(___ksymtab_unused_gpl+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .; \ - *(SORT(___ksymtab_gpl_future+*)) \ + KEEP(*(SORT(___ksymtab_gpl_future+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .; \ } \ \ /* Kernel symbol table: Normal symbols */ \ __kcrctab : AT(ADDR(__kcrctab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab) = .; \ - *(SORT(___kcrctab+*)) \ + KEEP(*(SORT(___kcrctab+*))) \ VMLINUX_SYMBOL(__stop___kcrctab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl) = .; \ - *(SORT(___kcrctab_gpl+*)) \ + KEEP(*(SORT(___kcrctab_gpl+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __kcrctab_unused : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused) = .; \ - *(SORT(___kcrctab_unused+*)) \ + KEEP(*(SORT(___kcrctab_unused+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused_gpl) = .; \ - *(SORT(___kcrctab_unused_gpl+*)) \ + KEEP(*(SORT(___kcrctab_unused_gpl+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .; \ - *(SORT(___kcrctab_gpl_future+*)) \ + KEEP(*(SORT(___kcrctab_gpl_future+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .; \ } 
\ \ /* Kernel symbol table: strings */ \ __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \ - *(__ksymtab_strings) \ + KEEP(*(__ksymtab_strings)) \ } \ \ /* __*init sections */ \ @@ -424,7 +429,7 @@ #define SECURITY_INIT \ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ - *(.security_initcall.init) \ + KEEP(*(.security_initcall.init)) \ VMLINUX_SYMBOL(__security_initcall_end) = .; \ } @@ -432,7 +437,7 @@ * during second ld run in second ld pass when generating System.map */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text .text.fixup .text.unlikely) \ + *(.text.hot .text .text.fixup .text.unlikely .text.*) \ *(.ref.text) \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ @@ -527,6 +532,7 @@ /* init and exit section handling */ #define INIT_DATA \ + KEEP(*(SORT(___kentry+*))) \ *(.init.data) \ MEM_DISCARD(init.data) \ KERNEL_CTORS() \ @@ -593,7 +599,7 @@ BSS_FIRST_SECTIONS \ *(.bss..page_aligned) \ *(.dynbss) \ - *(.bss) \ + *(.bss .bss.[0-9a-zA-Z_]*) \ *(COMMON) \ } @@ -676,12 +682,12 @@ #define INIT_CALLS_LEVEL(level) \ VMLINUX_SYMBOL(__initcall##level##_start) = .; \ - *(.initcall##level##.init) \ - *(.initcall##level##s.init) \ + KEEP(*(.initcall##level##.init)) \ + KEEP(*(.initcall##level##s.init)) \ #define INIT_CALLS \ VMLINUX_SYMBOL(__initcall_start) = .; \ - *(.initcallearly.init) \ + KEEP(*(.initcallearly.init)) \ INIT_CALLS_LEVEL(0) \ INIT_CALLS_LEVEL(1) \ INIT_CALLS_LEVEL(2) \ @@ -695,21 +701,21 @@ #define CON_INITCALL \ VMLINUX_SYMBOL(__con_initcall_start) = .; \ - *(.con_initcall.init) \ + KEEP(*(.con_initcall.init)) \ VMLINUX_SYMBOL(__con_initcall_end) = .; #define SECURITY_INITCALL \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ - *(.security_initcall.init) \ + KEEP(*(.security_initcall.init)) \ VMLINUX_SYMBOL(__security_initcall_end) = .; #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ . = ALIGN(4); \ VMLINUX_SYMBOL(__initramfs_start) = .; \ - *(.init.ramfs) \ + KEEP(*(.init.ramfs)) \ . = ALIGN(8); \ - *(.init.ramfs.info) + KEEP(*(.init.ramfs.info)) #else #define INIT_RAM_FS #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 1bb954842725..86130cded110 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -182,6 +182,29 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define unreachable() do { } while (1) #endif +/* + * KENTRY - kernel entry point + * This can be used to annotate symbols (functions or data) that are used + * without their linker symbol being referenced explicitly. For example, + * interrupt vector handlers, or functions in the kernel image that are found + * programatically. + * + * Not required for symbols exported with EXPORT_SYMBOL, or initcalls. Those + * are handled in their own way (with KEEP() in linker scripts). + * + * KENTRY can be avoided if the symbols in question are marked as KEEP() in the + * linker script. For example an architecture could KEEP() its entire + * boot/exception vector code rather than annotate each function and data. 
+ */ +#ifndef KENTRY +# define KENTRY(sym) \ + extern typeof(sym) sym; \ + static const unsigned long __kentry_##sym \ + __used \ + __attribute__((section("___kentry" "+" #sym ), used)) \ + = (unsigned long)&sym; +#endif + #ifndef RELOC_HIDE # define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ diff --git a/include/linux/export.h b/include/linux/export.h index c565f87f005e..337cb90f3668 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -1,5 +1,6 @@ #ifndef _LINUX_EXPORT_H #define _LINUX_EXPORT_H + /* * Export symbols from the kernel to modules. Forked from module.h * to reduce the amount of pointless cruft we feed to gcc when only @@ -42,27 +43,26 @@ extern struct module __this_module; #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to * generate a checksums for some symbols */ -#define __CRC_SYMBOL(sym, sec) \ - extern __visible void *__crc_##sym __attribute__((weak)); \ - static const unsigned long __kcrctab_##sym \ - __used \ - __attribute__((section("___kcrctab" sec "+" #sym), unused)) \ +#define __CRC_SYMBOL(sym, sec) \ + extern __visible void *__crc_##sym __attribute__((weak)); \ + static const unsigned long __kcrctab_##sym \ + __used \ + __attribute__((section("___kcrctab" sec "+" #sym), used)) \ = (unsigned long) &__crc_##sym; #else #define __CRC_SYMBOL(sym, sec) #endif /* For every exported symbol, place a struct in the __ksymtab section */ -#define ___EXPORT_SYMBOL(sym, sec) \ - extern typeof(sym) sym; \ - __CRC_SYMBOL(sym, sec) \ - static const char __kstrtab_##sym[] \ - __attribute__((section("__ksymtab_strings"), aligned(1))) \ - = VMLINUX_SYMBOL_STR(sym); \ - extern const struct kernel_symbol __ksymtab_##sym; \ - __visible const struct kernel_symbol __ksymtab_##sym \ - __used \ - __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ +#define ___EXPORT_SYMBOL(sym, sec) \ + extern typeof(sym) sym; \ + __CRC_SYMBOL(sym, sec) \ + static const char __kstrtab_##sym[] \ + __attribute__((section("__ksymtab_strings"), aligned(1))) \ + = VMLINUX_SYMBOL_STR(sym); \ + static const struct kernel_symbol __ksymtab_##sym \ + __used \ + __attribute__((section("___ksymtab" sec "+" #sym), used)) \ = { (unsigned long)&sym, __kstrtab_##sym } #if defined(__KSYM_DEPS__) diff --git a/include/linux/init.h b/include/linux/init.h index 6935d02474aa..e571fec4bb28 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -150,24 +150,8 @@ extern bool initcall_debug; #ifndef __ASSEMBLY__ -#ifdef CONFIG_LTO -/* Work around a LTO gcc problem: when there is no reference to a variable - * in a module it will be moved to the end of the program. This causes - * reordering of initcalls which the kernel does not like. - * Add a dummy reference function to avoid this. The function is - * deleted by the linker. - */ -#define LTO_REFERENCE_INITCALL(x) \ - ; /* yes this is needed */ \ - static __used __exit void *reference_##x(void) \ - { \ - return &x; \ - } -#else -#define LTO_REFERENCE_INITCALL(x) -#endif - -/* initcalls are now grouped by functionality into separate +/* + * initcalls are now grouped by functionality into separate * subsections. Ordering inside the subsections is determined * by link order. * For backwards compatibility, initcall() puts the call in @@ -175,12 +159,16 @@ extern bool initcall_debug; * * The `id' arg to __define_initcall() is needed so that multiple initcalls * can point at the same handler without causing duplicate-symbol build errors. 
+ * + * Initcalls are run by placing pointers in initcall sections that the + * kernel iterates at runtime. The linker can do dead code / data elimination + * and remove that completely, so the initcall sections have to be marked + * as KEEP() in the linker script. */ #define __define_initcall(fn, id) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; \ - LTO_REFERENCE_INITCALL(__initcall_##fn##id) + __attribute__((__section__(".initcall" #id ".init"))) = fn; /* * Early initcalls run before initializing SMP. @@ -216,15 +204,15 @@ extern bool initcall_debug; #define __initcall(fn) device_initcall(fn) -#define __exitcall(fn) \ +#define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ +#define console_initcall(fn) \ + static initcall_t __initcall_##fn \ __used __section(.con_initcall.init) = fn -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ +#define security_initcall(fn) \ + static initcall_t __initcall_##fn \ __used __section(.security_initcall.init) = fn struct obs_kernel_param { diff --git a/init/Makefile b/init/Makefile index 7bc47ee31c36..c4fb45525d08 100644 --- a/init/Makefile +++ b/init/Makefile @@ -2,6 +2,8 @@ # Makefile for the linux kernel. # +ccflags-y := -fno-function-sections -fno-data-sections + obj-y := main.o version.o mounts.o ifneq ($(CONFIG_BLK_DEV_INITRD),y) obj-y += noinitramfs.o -- cgit v1.2.3 From 0f4c4af06eec5717041d6fe568b80ee4cf3c1475 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Sep 2016 12:24:03 +1000 Subject: kbuild: -ffunction-sections fix for archs with conflicting sections Enabling -ffunction-sections modified the generic linker script to pull .text.* sections into regular TEXT_TEXT section, conflicting with some architectures. Revert that change and require archs that enable the option to ensure they have no conflicting section names, and do the appropriate merging. Reported-by: Guenter Roeck Tested-by: Guenter Roeck Fixes: b67067f1176d ("kbuild: allow archs to select link dead code/data elimination") Signed-off-by: Nicholas Piggin Signed-off-by: Michal Marek --- arch/Kconfig | 4 +++- include/asm-generic/vmlinux.lds.h | 15 ++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 3f948c422d9d..48d1e76a1ee3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -478,7 +478,9 @@ config LD_DEAD_CODE_DATA_ELIMINATION This requires that the arch annotates or otherwise protects its external entry points from being discarded. Linker scripts must also merge .text.*, .data.*, and .bss.* correctly into - output sections. + output sections. Care must be taken not to pull in unrelated + sections (e.g., '.text.init'). Typically '.' in section names + is used to distinguish them from label names / C identifiers. config HAVE_CONTEXT_TRACKING bool diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ad9d8f94dc7a..48dd44f3f24b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -198,9 +198,9 @@ /* * .data section - * -fdata-sections generates .data.identifier which needs to be pulled in - * with .data, but don't want to pull in .data..stuff which has its own - * requirements. Same for bss. 
+ * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections generates + * .data.identifier which needs to be pulled in with .data, but don't want to + * pull in .data..stuff which has its own requirements. Same for bss. */ #define DATA_DATA \ *(.data .data.[0-9a-zA-Z_]*) \ @@ -434,10 +434,15 @@ } /* .text section. Map to function alignment to avoid address changes - * during second ld run in second ld pass when generating System.map */ + * during second ld run in second ld pass when generating System.map + * LD_DEAD_CODE_DATA_ELIMINATION option enables -ffunction-sections generates + * .text.identifier which needs to be pulled in with .text , but some + * architectures define .text.foo which is not intended to be pulled in here. + * Those enabling LD_DEAD_CODE_DATA_ELIMINATION must ensure they don't have + * conflicting section names, and must pull in .text.[0-9a-zA-Z_]* */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text .text.fixup .text.unlikely .text.*) \ + *(.text.hot .text .text.fixup .text.unlikely) \ *(.ref.text) \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ -- cgit v1.2.3 From dcd3ea81f572fba57d18e91743a525a9787c1f6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Sep 2016 07:49:50 -0700 Subject: libata: remove asm-generic is only intended for architecture defaults, and we can simply kill it off by moving the two defintions directly to . Signed-off-by: Christoph Hellwig Signed-off-by: Tejun Heo --- include/asm-generic/libata-portmap.h | 7 ------- include/linux/libata.h | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) delete mode 100644 include/asm-generic/libata-portmap.h (limited to 'include') diff --git a/include/asm-generic/libata-portmap.h b/include/asm-generic/libata-portmap.h deleted file mode 100644 index cf14f2ff40b6..000000000000 --- a/include/asm-generic/libata-portmap.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_GENERIC_LIBATA_PORTMAP_H -#define __ASM_GENERIC_LIBATA_PORTMAP_H - -#define ATA_PRIMARY_IRQ(dev) 14 -#define ATA_SECONDARY_IRQ(dev) 15 - -#endif diff --git a/include/linux/libata.h b/include/linux/libata.h index e37d4f99f510..616eef4d81ea 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -46,7 +46,8 @@ #ifdef CONFIG_ATA_NONSTANDARD #include #else -#include +#define ATA_PRIMARY_IRQ(dev) 14 +#define ATA_SECONDARY_IRQ(dev) 15 #endif /* -- cgit v1.2.3 From 1b5ca12127427c51be605a75ecd0141eb3357249 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Sep 2016 11:55:54 -0400 Subject: percpu: improve generic percpu modify-return implementation Some architectures require an additional load to find the address of percpu pointers. In some implemenatations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and those kind of games, but that can require larger and arch specific changes. 
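In other words, rather than expanding into two independent raw_cpu_*() accesses, each of
which re-derives the percpu address, the generic helpers now take the address once with
raw_cpu_ptr() and operate through that pointer. The add-return helper, shown here in
simplified before/after form, is representative of the pattern applied to the xchg,
cmpxchg and cmpxchg_double helpers as well:

Before:

	#define raw_cpu_generic_add_return(pcp, val)		\
	({							\
		raw_cpu_add(pcp, val);	/* derives the address */	\
		raw_cpu_read(pcp);	/* derives it again */		\
	})

After:

	#define raw_cpu_generic_add_return(pcp, val)		\
	({							\
		typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));	\
								\
		*__p += val;					\
		*__p;						\
	})
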
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233c4ba8..40e887068da2 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ + \ + *__p += val; \ + *__p; \ }) #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = *__p; \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ + __ret = *__p; \ if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ ({ \ + typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \ int __ret = 0; \ - if (raw_cpu_read(pcp1) == (oval1) && \ - raw_cpu_read(pcp2) == (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 == (oval1) && *__p2 == (oval2)) { \ + *__p1 = nval1; \ + *__p2 = nval2; \ __ret = 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret = *this_cpu_ptr(&(pcp)); \ + __ret = raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret = raw_cpu_read(pcp); \ + __ret = raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) 
+#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_write_1 -- cgit v1.2.3 From 1f9c0a77278bc9ec7df14bbd799fccfabff1010a Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Thu, 22 Sep 2016 18:04:08 +0300 Subject: soc/fsl: Introduce DPAA 1.x BMan device driver This driver enables the Freescale DPAA 1.x Buffer Manager block. BMan is a hardware accelerator that manages buffer pools. It allows CPUs and other accelerators connected to the SoC datapath to acquire and release buffers during data processing. Signed-off-by: Roy Pledge Signed-off-by: Claudiu Manoil Signed-off-by: Scott Wood --- drivers/soc/fsl/qbman/Kconfig | 28 ++ drivers/soc/fsl/qbman/Makefile | 2 + drivers/soc/fsl/qbman/bman.c | 797 ++++++++++++++++++++++++++++++++++++ drivers/soc/fsl/qbman/bman_ccsr.c | 263 ++++++++++++ drivers/soc/fsl/qbman/bman_portal.c | 219 ++++++++++ drivers/soc/fsl/qbman/bman_priv.h | 80 ++++ drivers/soc/fsl/qbman/dpaa_sys.h | 103 +++++ include/soc/fsl/bman.h | 129 ++++++ 8 files changed, 1621 insertions(+) create mode 100644 drivers/soc/fsl/qbman/Kconfig create mode 100644 drivers/soc/fsl/qbman/Makefile create mode 100644 drivers/soc/fsl/qbman/bman.c create mode 100644 drivers/soc/fsl/qbman/bman_ccsr.c create mode 100644 drivers/soc/fsl/qbman/bman_portal.c create mode 100644 drivers/soc/fsl/qbman/bman_priv.h create mode 100644 drivers/soc/fsl/qbman/dpaa_sys.h create mode 100644 include/soc/fsl/bman.h (limited to 'include') diff --git a/drivers/soc/fsl/qbman/Kconfig b/drivers/soc/fsl/qbman/Kconfig new file mode 100644 index 000000000000..8f2df64224bf --- /dev/null +++ b/drivers/soc/fsl/qbman/Kconfig @@ -0,0 +1,28 @@ +menuconfig FSL_DPAA + bool "Freescale DPAA 1.x support" + depends on FSL_SOC_BOOKE + select GENERIC_ALLOCATOR + help + The Freescale Data Path Acceleration Architecture (DPAA) is a set of + hardware components on specific QorIQ multicore processors. + This architecture provides the infrastructure to support simplified + sharing of networking interfaces and accelerators by multiple CPUs. + The major h/w blocks composing DPAA are BMan and QMan. + + The Buffer Manager (BMan) is a hardware buffer pool management block + that allows software and accelerators on the datapath to acquire and + release buffers in order to build frames. + + The Queue Manager (QMan) is a hardware queue management block + that allows software and accelerators on the datapath to enqueue and + dequeue frames in order to communicate. + +if FSL_DPAA + +config FSL_DPAA_CHECKING + bool "Additional driver checking" + help + Compiles in additional checks, to sanity-check the drivers and + any use of the exported API. Not recommended for performance. + +endif # FSL_DPAA diff --git a/drivers/soc/fsl/qbman/Makefile b/drivers/soc/fsl/qbman/Makefile new file mode 100644 index 000000000000..855c3ac0f89e --- /dev/null +++ b/drivers/soc/fsl/qbman/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_FSL_DPAA) += bman_ccsr.o bman_portal.o \ + bman.o diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c new file mode 100644 index 000000000000..ffa48fdbb1a9 --- /dev/null +++ b/drivers/soc/fsl/qbman/bman.c @@ -0,0 +1,797 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bman_priv.h" + +#define IRQNAME "BMan portal %d" +#define MAX_IRQNAME 16 /* big enough for "BMan portal %d" */ + +/* Portal register assists */ + +/* Cache-inhibited register offsets */ +#define BM_REG_RCR_PI_CINH 0x0000 +#define BM_REG_RCR_CI_CINH 0x0004 +#define BM_REG_RCR_ITR 0x0008 +#define BM_REG_CFG 0x0100 +#define BM_REG_SCN(n) (0x0200 + ((n) << 2)) +#define BM_REG_ISR 0x0e00 +#define BM_REG_IER 0x0e04 +#define BM_REG_ISDR 0x0e08 +#define BM_REG_IIR 0x0e0c + +/* Cache-enabled register offsets */ +#define BM_CL_CR 0x0000 +#define BM_CL_RR0 0x0100 +#define BM_CL_RR1 0x0140 +#define BM_CL_RCR 0x1000 +#define BM_CL_RCR_PI_CENA 0x3000 +#define BM_CL_RCR_CI_CENA 0x3100 + +/* + * Portal modes. + * Enum types; + * pmode == production mode + * cmode == consumption mode, + * Enum values use 3 letter codes. 
First letter matches the portal mode, + * remaining two letters indicate; + * ci == cache-inhibited portal register + * ce == cache-enabled portal register + * vb == in-band valid-bit (cache-enabled) + */ +enum bm_rcr_pmode { /* matches BCSP_CFG::RPM */ + bm_rcr_pci = 0, /* PI index, cache-inhibited */ + bm_rcr_pce = 1, /* PI index, cache-enabled */ + bm_rcr_pvb = 2 /* valid-bit */ +}; +enum bm_rcr_cmode { /* s/w-only */ + bm_rcr_cci, /* CI index, cache-inhibited */ + bm_rcr_cce /* CI index, cache-enabled */ +}; + + +/* --- Portal structures --- */ + +#define BM_RCR_SIZE 8 + +/* Release Command */ +struct bm_rcr_entry { + union { + struct { + u8 _ncw_verb; /* writes to this are non-coherent */ + u8 bpid; /* used with BM_RCR_VERB_CMD_BPID_SINGLE */ + u8 __reserved1[62]; + }; + struct bm_buffer bufs[8]; + }; +}; +#define BM_RCR_VERB_VBIT 0x80 +#define BM_RCR_VERB_CMD_MASK 0x70 /* one of two values; */ +#define BM_RCR_VERB_CMD_BPID_SINGLE 0x20 +#define BM_RCR_VERB_CMD_BPID_MULTI 0x30 +#define BM_RCR_VERB_BUFCOUNT_MASK 0x0f /* values 1..8 */ + +struct bm_rcr { + struct bm_rcr_entry *ring, *cursor; + u8 ci, available, ithresh, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + u32 busy; + enum bm_rcr_pmode pmode; + enum bm_rcr_cmode cmode; +#endif +}; + +/* MC (Management Command) command */ +struct bm_mc_command { + u8 _ncw_verb; /* writes to this are non-coherent */ + u8 bpid; /* used by acquire command */ + u8 __reserved[62]; +}; +#define BM_MCC_VERB_VBIT 0x80 +#define BM_MCC_VERB_CMD_MASK 0x70 /* where the verb contains; */ +#define BM_MCC_VERB_CMD_ACQUIRE 0x10 +#define BM_MCC_VERB_CMD_QUERY 0x40 +#define BM_MCC_VERB_ACQUIRE_BUFCOUNT 0x0f /* values 1..8 go here */ + +/* MC result, Acquire and Query Response */ +union bm_mc_result { + struct { + u8 verb; + u8 bpid; + u8 __reserved[62]; + }; + struct bm_buffer bufs[8]; +}; +#define BM_MCR_VERB_VBIT 0x80 +#define BM_MCR_VERB_CMD_MASK BM_MCC_VERB_CMD_MASK +#define BM_MCR_VERB_CMD_ACQUIRE BM_MCC_VERB_CMD_ACQUIRE +#define BM_MCR_VERB_CMD_QUERY BM_MCC_VERB_CMD_QUERY +#define BM_MCR_VERB_CMD_ERR_INVALID 0x60 +#define BM_MCR_VERB_CMD_ERR_ECC 0x70 +#define BM_MCR_VERB_ACQUIRE_BUFCOUNT BM_MCC_VERB_ACQUIRE_BUFCOUNT /* 0..8 */ +#define BM_MCR_TIMEOUT 10000 /* us */ + +struct bm_mc { + struct bm_mc_command *cr; + union bm_mc_result *rr; + u8 rridx, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + enum { + /* Can only be _mc_start()ed */ + mc_idle, + /* Can only be _mc_commit()ed or _mc_abort()ed */ + mc_user, + /* Can only be _mc_retry()ed */ + mc_hw + } state; +#endif +}; + +struct bm_addr { + void __iomem *ce; /* cache-enabled */ + void __iomem *ci; /* cache-inhibited */ +}; + +struct bm_portal { + struct bm_addr addr; + struct bm_rcr rcr; + struct bm_mc mc; +} ____cacheline_aligned; + +/* Cache-inhibited register access. 
*/ +static inline u32 bm_in(struct bm_portal *p, u32 offset) +{ + return __raw_readl(p->addr.ci + offset); +} + +static inline void bm_out(struct bm_portal *p, u32 offset, u32 val) +{ + __raw_writel(val, p->addr.ci + offset); +} + +/* Cache Enabled Portal Access */ +static inline void bm_cl_invalidate(struct bm_portal *p, u32 offset) +{ + dpaa_invalidate(p->addr.ce + offset); +} + +static inline void bm_cl_touch_ro(struct bm_portal *p, u32 offset) +{ + dpaa_touch_ro(p->addr.ce + offset); +} + +static inline u32 bm_ce_in(struct bm_portal *p, u32 offset) +{ + return __raw_readl(p->addr.ce + offset); +} + +struct bman_portal { + struct bm_portal p; + /* interrupt sources processed by portal_isr(), configurable */ + unsigned long irq_sources; + /* probing time config params for cpu-affine portals */ + const struct bm_portal_config *config; + char irqname[MAX_IRQNAME]; +}; + +static cpumask_t affine_mask; +static DEFINE_SPINLOCK(affine_mask_lock); +static DEFINE_PER_CPU(struct bman_portal, bman_affine_portal); + +static inline struct bman_portal *get_affine_portal(void) +{ + return &get_cpu_var(bman_affine_portal); +} + +static inline void put_affine_portal(void) +{ + put_cpu_var(bman_affine_portal); +} + +/* + * This object type refers to a pool, it isn't *the* pool. There may be + * more than one such object per BMan buffer pool, eg. if different users of the + * pool are operating via different portals. + */ +struct bman_pool { + /* index of the buffer pool to encapsulate (0-63) */ + u32 bpid; + /* Used for hash-table admin when using depletion notifications. */ + struct bman_portal *portal; + struct bman_pool *next; +}; + +static u32 poll_portal_slow(struct bman_portal *p, u32 is); + +static irqreturn_t portal_isr(int irq, void *ptr) +{ + struct bman_portal *p = ptr; + struct bm_portal *portal = &p->p; + u32 clear = p->irq_sources; + u32 is = bm_in(portal, BM_REG_ISR) & p->irq_sources; + + if (unlikely(!is)) + return IRQ_NONE; + + clear |= poll_portal_slow(p, is); + bm_out(portal, BM_REG_ISR, clear); + return IRQ_HANDLED; +} + +/* --- RCR API --- */ + +#define RCR_SHIFT ilog2(sizeof(struct bm_rcr_entry)) +#define RCR_CARRY (uintptr_t)(BM_RCR_SIZE << RCR_SHIFT) + +/* Bit-wise logic to wrap a ring pointer by clearing the "carry bit" */ +static struct bm_rcr_entry *rcr_carryclear(struct bm_rcr_entry *p) +{ + uintptr_t addr = (uintptr_t)p; + + addr &= ~RCR_CARRY; + + return (struct bm_rcr_entry *)addr; +} + +#ifdef CONFIG_FSL_DPAA_CHECKING +/* Bit-wise logic to convert a ring pointer to a ring index */ +static int rcr_ptr2idx(struct bm_rcr_entry *e) +{ + return ((uintptr_t)e >> RCR_SHIFT) & (BM_RCR_SIZE - 1); +} +#endif + +/* Increment the 'cursor' ring pointer, taking 'vbit' into account */ +static inline void rcr_inc(struct bm_rcr *rcr) +{ + /* increment to the next RCR pointer and handle overflow and 'vbit' */ + struct bm_rcr_entry *partial = rcr->cursor + 1; + + rcr->cursor = rcr_carryclear(partial); + if (partial != rcr->cursor) + rcr->vbit ^= BM_RCR_VERB_VBIT; +} + +static int bm_rcr_get_avail(struct bm_portal *portal) +{ + struct bm_rcr *rcr = &portal->rcr; + + return rcr->available; +} + +static int bm_rcr_get_fill(struct bm_portal *portal) +{ + struct bm_rcr *rcr = &portal->rcr; + + return BM_RCR_SIZE - 1 - rcr->available; +} + +static void bm_rcr_set_ithresh(struct bm_portal *portal, u8 ithresh) +{ + struct bm_rcr *rcr = &portal->rcr; + + rcr->ithresh = ithresh; + bm_out(portal, BM_REG_RCR_ITR, ithresh); +} + +static void bm_rcr_cce_prefetch(struct bm_portal *portal) +{ + 
__maybe_unused struct bm_rcr *rcr = &portal->rcr; + + DPAA_ASSERT(rcr->cmode == bm_rcr_cce); + bm_cl_touch_ro(portal, BM_CL_RCR_CI_CENA); +} + +static u8 bm_rcr_cce_update(struct bm_portal *portal) +{ + struct bm_rcr *rcr = &portal->rcr; + u8 diff, old_ci = rcr->ci; + + DPAA_ASSERT(rcr->cmode == bm_rcr_cce); + rcr->ci = bm_ce_in(portal, BM_CL_RCR_CI_CENA) & (BM_RCR_SIZE - 1); + bm_cl_invalidate(portal, BM_CL_RCR_CI_CENA); + diff = dpaa_cyc_diff(BM_RCR_SIZE, old_ci, rcr->ci); + rcr->available += diff; + return diff; +} + +static inline struct bm_rcr_entry *bm_rcr_start(struct bm_portal *portal) +{ + struct bm_rcr *rcr = &portal->rcr; + + DPAA_ASSERT(!rcr->busy); + if (!rcr->available) + return NULL; +#ifdef CONFIG_FSL_DPAA_CHECKING + rcr->busy = 1; +#endif + dpaa_zero(rcr->cursor); + return rcr->cursor; +} + +static inline void bm_rcr_pvb_commit(struct bm_portal *portal, u8 myverb) +{ + struct bm_rcr *rcr = &portal->rcr; + struct bm_rcr_entry *rcursor; + + DPAA_ASSERT(rcr->busy); + DPAA_ASSERT(rcr->pmode == bm_rcr_pvb); + DPAA_ASSERT(rcr->available >= 1); + dma_wmb(); + rcursor = rcr->cursor; + rcursor->_ncw_verb = myverb | rcr->vbit; + dpaa_flush(rcursor); + rcr_inc(rcr); + rcr->available--; +#ifdef CONFIG_FSL_DPAA_CHECKING + rcr->busy = 0; +#endif +} + +static int bm_rcr_init(struct bm_portal *portal, enum bm_rcr_pmode pmode, + enum bm_rcr_cmode cmode) +{ + struct bm_rcr *rcr = &portal->rcr; + u32 cfg; + u8 pi; + + rcr->ring = portal->addr.ce + BM_CL_RCR; + rcr->ci = bm_in(portal, BM_REG_RCR_CI_CINH) & (BM_RCR_SIZE - 1); + pi = bm_in(portal, BM_REG_RCR_PI_CINH) & (BM_RCR_SIZE - 1); + rcr->cursor = rcr->ring + pi; + rcr->vbit = (bm_in(portal, BM_REG_RCR_PI_CINH) & BM_RCR_SIZE) ? + BM_RCR_VERB_VBIT : 0; + rcr->available = BM_RCR_SIZE - 1 + - dpaa_cyc_diff(BM_RCR_SIZE, rcr->ci, pi); + rcr->ithresh = bm_in(portal, BM_REG_RCR_ITR); +#ifdef CONFIG_FSL_DPAA_CHECKING + rcr->busy = 0; + rcr->pmode = pmode; + rcr->cmode = cmode; +#endif + cfg = (bm_in(portal, BM_REG_CFG) & 0xffffffe0) + | (pmode & 0x3); /* BCSP_CFG::RPM */ + bm_out(portal, BM_REG_CFG, cfg); + return 0; +} + +static void bm_rcr_finish(struct bm_portal *portal) +{ +#ifdef CONFIG_FSL_DPAA_CHECKING + struct bm_rcr *rcr = &portal->rcr; + int i; + + DPAA_ASSERT(!rcr->busy); + + i = bm_in(portal, BM_REG_RCR_PI_CINH) & (BM_RCR_SIZE - 1); + if (i != rcr_ptr2idx(rcr->cursor)) + pr_crit("losing uncommited RCR entries\n"); + + i = bm_in(portal, BM_REG_RCR_CI_CINH) & (BM_RCR_SIZE - 1); + if (i != rcr->ci) + pr_crit("missing existing RCR completions\n"); + if (rcr->ci != rcr_ptr2idx(rcr->cursor)) + pr_crit("RCR destroyed unquiesced\n"); +#endif +} + +/* --- Management command API --- */ +static int bm_mc_init(struct bm_portal *portal) +{ + struct bm_mc *mc = &portal->mc; + + mc->cr = portal->addr.ce + BM_CL_CR; + mc->rr = portal->addr.ce + BM_CL_RR0; + mc->rridx = (__raw_readb(&mc->cr->_ncw_verb) & BM_MCC_VERB_VBIT) ? + 0 : 1; + mc->vbit = mc->rridx ? 
BM_MCC_VERB_VBIT : 0; +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = mc_idle; +#endif + return 0; +} + +static void bm_mc_finish(struct bm_portal *portal) +{ +#ifdef CONFIG_FSL_DPAA_CHECKING + struct bm_mc *mc = &portal->mc; + + DPAA_ASSERT(mc->state == mc_idle); + if (mc->state != mc_idle) + pr_crit("Losing incomplete MC command\n"); +#endif +} + +static inline struct bm_mc_command *bm_mc_start(struct bm_portal *portal) +{ + struct bm_mc *mc = &portal->mc; + + DPAA_ASSERT(mc->state == mc_idle); +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = mc_user; +#endif + dpaa_zero(mc->cr); + return mc->cr; +} + +static inline void bm_mc_commit(struct bm_portal *portal, u8 myverb) +{ + struct bm_mc *mc = &portal->mc; + union bm_mc_result *rr = mc->rr + mc->rridx; + + DPAA_ASSERT(mc->state == mc_user); + dma_wmb(); + mc->cr->_ncw_verb = myverb | mc->vbit; + dpaa_flush(mc->cr); + dpaa_invalidate_touch_ro(rr); +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = mc_hw; +#endif +} + +static inline union bm_mc_result *bm_mc_result(struct bm_portal *portal) +{ + struct bm_mc *mc = &portal->mc; + union bm_mc_result *rr = mc->rr + mc->rridx; + + DPAA_ASSERT(mc->state == mc_hw); + /* + * The inactive response register's verb byte always returns zero until + * its command is submitted and completed. This includes the valid-bit, + * in case you were wondering... + */ + if (!__raw_readb(&rr->verb)) { + dpaa_invalidate_touch_ro(rr); + return NULL; + } + mc->rridx ^= 1; + mc->vbit ^= BM_MCC_VERB_VBIT; +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = mc_idle; +#endif + return rr; +} + +static inline int bm_mc_result_timeout(struct bm_portal *portal, + union bm_mc_result **mcr) +{ + int timeout = BM_MCR_TIMEOUT; + + do { + *mcr = bm_mc_result(portal); + if (*mcr) + break; + udelay(1); + } while (--timeout); + + return timeout; +} + +/* Disable all BSCN interrupts for the portal */ +static void bm_isr_bscn_disable(struct bm_portal *portal) +{ + bm_out(portal, BM_REG_SCN(0), 0); + bm_out(portal, BM_REG_SCN(1), 0); +} + +static int bman_create_portal(struct bman_portal *portal, + const struct bm_portal_config *c) +{ + struct bm_portal *p; + int ret; + + p = &portal->p; + /* + * prep the low-level portal struct with the mapped addresses from the + * config, everything that follows depends on it and "config" is more + * for (de)reference... + */ + p->addr.ce = c->addr_virt[DPAA_PORTAL_CE]; + p->addr.ci = c->addr_virt[DPAA_PORTAL_CI]; + if (bm_rcr_init(p, bm_rcr_pvb, bm_rcr_cce)) { + dev_err(c->dev, "RCR initialisation failed\n"); + goto fail_rcr; + } + if (bm_mc_init(p)) { + dev_err(c->dev, "MC initialisation failed\n"); + goto fail_mc; + } + /* + * Default to all BPIDs disabled, we enable as required at + * run-time. 
+ */ + bm_isr_bscn_disable(p); + + /* Write-to-clear any stale interrupt status bits */ + bm_out(p, BM_REG_ISDR, 0xffffffff); + portal->irq_sources = 0; + bm_out(p, BM_REG_IER, 0); + bm_out(p, BM_REG_ISR, 0xffffffff); + snprintf(portal->irqname, MAX_IRQNAME, IRQNAME, c->cpu); + if (request_irq(c->irq, portal_isr, 0, portal->irqname, portal)) { + dev_err(c->dev, "request_irq() failed\n"); + goto fail_irq; + } + if (c->cpu != -1 && irq_can_set_affinity(c->irq) && + irq_set_affinity(c->irq, cpumask_of(c->cpu))) { + dev_err(c->dev, "irq_set_affinity() failed\n"); + goto fail_affinity; + } + + /* Need RCR to be empty before continuing */ + ret = bm_rcr_get_fill(p); + if (ret) { + dev_err(c->dev, "RCR unclean\n"); + goto fail_rcr_empty; + } + /* Success */ + portal->config = c; + + bm_out(p, BM_REG_ISDR, 0); + bm_out(p, BM_REG_IIR, 0); + + return 0; + +fail_rcr_empty: +fail_affinity: + free_irq(c->irq, portal); +fail_irq: + bm_mc_finish(p); +fail_mc: + bm_rcr_finish(p); +fail_rcr: + return -EIO; +} + +struct bman_portal *bman_create_affine_portal(const struct bm_portal_config *c) +{ + struct bman_portal *portal; + int err; + + portal = &per_cpu(bman_affine_portal, c->cpu); + err = bman_create_portal(portal, c); + if (err) + return NULL; + + spin_lock(&affine_mask_lock); + cpumask_set_cpu(c->cpu, &affine_mask); + spin_unlock(&affine_mask_lock); + + return portal; +} + +static u32 poll_portal_slow(struct bman_portal *p, u32 is) +{ + u32 ret = is; + + if (is & BM_PIRQ_RCRI) { + bm_rcr_cce_update(&p->p); + bm_rcr_set_ithresh(&p->p, 0); + bm_out(&p->p, BM_REG_ISR, BM_PIRQ_RCRI); + is &= ~BM_PIRQ_RCRI; + } + + /* There should be no status register bits left undefined */ + DPAA_ASSERT(!is); + return ret; +} + +int bman_p_irqsource_add(struct bman_portal *p, u32 bits) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + set_bits(bits & BM_PIRQ_VISIBLE, &p->irq_sources); + bm_out(&p->p, BM_REG_IER, p->irq_sources); + local_irq_restore(irqflags); + return 0; +} + +static int bm_shutdown_pool(u32 bpid) +{ + struct bm_mc_command *bm_cmd; + union bm_mc_result *bm_res; + + while (1) { + struct bman_portal *p = get_affine_portal(); + /* Acquire buffers until empty */ + bm_cmd = bm_mc_start(&p->p); + bm_cmd->bpid = bpid; + bm_mc_commit(&p->p, BM_MCC_VERB_CMD_ACQUIRE | 1); + if (!bm_mc_result_timeout(&p->p, &bm_res)) { + put_affine_portal(); + pr_crit("BMan Acquire Command timedout\n"); + return -ETIMEDOUT; + } + if (!(bm_res->verb & BM_MCR_VERB_ACQUIRE_BUFCOUNT)) { + put_affine_portal(); + /* Pool is empty */ + return 0; + } + put_affine_portal(); + } + + return 0; +} + +struct gen_pool *bm_bpalloc; + +static int bm_alloc_bpid_range(u32 *result, u32 count) +{ + unsigned long addr; + + addr = gen_pool_alloc(bm_bpalloc, count); + if (!addr) + return -ENOMEM; + + *result = addr & ~DPAA_GENALLOC_OFF; + + return 0; +} + +static int bm_release_bpid(u32 bpid) +{ + int ret; + + ret = bm_shutdown_pool(bpid); + if (ret) { + pr_debug("BPID %d leaked\n", bpid); + return ret; + } + + gen_pool_free(bm_bpalloc, bpid | DPAA_GENALLOC_OFF, 1); + return 0; +} + +struct bman_pool *bman_new_pool(void) +{ + struct bman_pool *pool = NULL; + u32 bpid; + + if (bm_alloc_bpid_range(&bpid, 1)) + return NULL; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + goto err; + + pool->bpid = bpid; + + return pool; +err: + bm_release_bpid(bpid); + kfree(pool); + return NULL; +} +EXPORT_SYMBOL(bman_new_pool); + +void bman_free_pool(struct bman_pool *pool) +{ + bm_release_bpid(pool->bpid); + + kfree(pool); +} 
+EXPORT_SYMBOL(bman_free_pool); + +int bman_get_bpid(const struct bman_pool *pool) +{ + return pool->bpid; +} +EXPORT_SYMBOL(bman_get_bpid); + +static void update_rcr_ci(struct bman_portal *p, int avail) +{ + if (avail) + bm_rcr_cce_prefetch(&p->p); + else + bm_rcr_cce_update(&p->p); +} + +int bman_release(struct bman_pool *pool, const struct bm_buffer *bufs, u8 num) +{ + struct bman_portal *p; + struct bm_rcr_entry *r; + unsigned long irqflags; + int avail, timeout = 1000; /* 1ms */ + int i = num - 1; + + DPAA_ASSERT(num > 0 && num <= 8); + + do { + p = get_affine_portal(); + local_irq_save(irqflags); + avail = bm_rcr_get_avail(&p->p); + if (avail < 2) + update_rcr_ci(p, avail); + r = bm_rcr_start(&p->p); + local_irq_restore(irqflags); + put_affine_portal(); + if (likely(r)) + break; + + udelay(1); + } while (--timeout); + + if (unlikely(!timeout)) + return -ETIMEDOUT; + + p = get_affine_portal(); + local_irq_save(irqflags); + /* + * we can copy all but the first entry, as this can trigger badness + * with the valid-bit + */ + bm_buffer_set64(r->bufs, bm_buffer_get64(bufs)); + bm_buffer_set_bpid(r->bufs, pool->bpid); + if (i) + memcpy(&r->bufs[1], &bufs[1], i * sizeof(bufs[0])); + + bm_rcr_pvb_commit(&p->p, BM_RCR_VERB_CMD_BPID_SINGLE | + (num & BM_RCR_VERB_BUFCOUNT_MASK)); + + local_irq_restore(irqflags); + put_affine_portal(); + return 0; +} +EXPORT_SYMBOL(bman_release); + +int bman_acquire(struct bman_pool *pool, struct bm_buffer *bufs, u8 num) +{ + struct bman_portal *p = get_affine_portal(); + struct bm_mc_command *mcc; + union bm_mc_result *mcr; + int ret; + + DPAA_ASSERT(num > 0 && num <= 8); + + mcc = bm_mc_start(&p->p); + mcc->bpid = pool->bpid; + bm_mc_commit(&p->p, BM_MCC_VERB_CMD_ACQUIRE | + (num & BM_MCC_VERB_ACQUIRE_BUFCOUNT)); + if (!bm_mc_result_timeout(&p->p, &mcr)) { + put_affine_portal(); + pr_crit("BMan Acquire Timeout\n"); + return -ETIMEDOUT; + } + ret = mcr->verb & BM_MCR_VERB_ACQUIRE_BUFCOUNT; + if (bufs) + memcpy(&bufs[0], &mcr->bufs[0], num * sizeof(bufs[0])); + + put_affine_portal(); + if (ret != num) + ret = -ENOMEM; + return ret; +} +EXPORT_SYMBOL(bman_acquire); + +const struct bm_portal_config * +bman_get_bm_portal_config(const struct bman_portal *portal) +{ + return portal->config; +} diff --git a/drivers/soc/fsl/qbman/bman_ccsr.c b/drivers/soc/fsl/qbman/bman_ccsr.c new file mode 100644 index 000000000000..9deb0524543f --- /dev/null +++ b/drivers/soc/fsl/qbman/bman_ccsr.c @@ -0,0 +1,263 @@ +/* Copyright (c) 2009 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. 
+ * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bman_priv.h" + +u16 bman_ip_rev; +EXPORT_SYMBOL(bman_ip_rev); + +/* Register offsets */ +#define REG_FBPR_FPC 0x0800 +#define REG_ECSR 0x0a00 +#define REG_ECIR 0x0a04 +#define REG_EADR 0x0a08 +#define REG_EDATA(n) (0x0a10 + ((n) * 0x04)) +#define REG_SBEC(n) (0x0a80 + ((n) * 0x04)) +#define REG_IP_REV_1 0x0bf8 +#define REG_IP_REV_2 0x0bfc +#define REG_FBPR_BARE 0x0c00 +#define REG_FBPR_BAR 0x0c04 +#define REG_FBPR_AR 0x0c10 +#define REG_SRCIDR 0x0d04 +#define REG_LIODNR 0x0d08 +#define REG_ERR_ISR 0x0e00 +#define REG_ERR_IER 0x0e04 +#define REG_ERR_ISDR 0x0e08 + +/* Used by all error interrupt registers except 'inhibit' */ +#define BM_EIRQ_IVCI 0x00000010 /* Invalid Command Verb */ +#define BM_EIRQ_FLWI 0x00000008 /* FBPR Low Watermark */ +#define BM_EIRQ_MBEI 0x00000004 /* Multi-bit ECC Error */ +#define BM_EIRQ_SBEI 0x00000002 /* Single-bit ECC Error */ +#define BM_EIRQ_BSCN 0x00000001 /* pool State Change Notification */ + +struct bman_hwerr_txt { + u32 mask; + const char *txt; +}; + +static const struct bman_hwerr_txt bman_hwerr_txts[] = { + { BM_EIRQ_IVCI, "Invalid Command Verb" }, + { BM_EIRQ_FLWI, "FBPR Low Watermark" }, + { BM_EIRQ_MBEI, "Multi-bit ECC Error" }, + { BM_EIRQ_SBEI, "Single-bit ECC Error" }, + { BM_EIRQ_BSCN, "Pool State Change Notification" }, +}; + +/* Only trigger low water mark interrupt once only */ +#define BMAN_ERRS_TO_DISABLE BM_EIRQ_FLWI + +/* Pointer to the start of the BMan's CCSR space */ +static u32 __iomem *bm_ccsr_start; + +static inline u32 bm_ccsr_in(u32 offset) +{ + return ioread32be(bm_ccsr_start + offset/4); +} +static inline void bm_ccsr_out(u32 offset, u32 val) +{ + iowrite32be(val, bm_ccsr_start + offset/4); +} + +static void bm_get_version(u16 *id, u8 *major, u8 *minor) +{ + u32 v = bm_ccsr_in(REG_IP_REV_1); + *id = (v >> 16); + *major = (v >> 8) & 0xff; + *minor = v & 0xff; +} + +/* signal transactions for FBPRs with higher priority */ +#define FBPR_AR_RPRIO_HI BIT(30) + +static void bm_set_memory(u64 ba, u32 size) +{ + u32 exp = ilog2(size); + /* choke if size isn't within range */ + DPAA_ASSERT(size >= 4096 && size <= 1024*1024*1024 && + is_power_of_2(size)); + /* choke if '[e]ba' has lower-alignment than 'size' */ + DPAA_ASSERT(!(ba & (size - 1))); + bm_ccsr_out(REG_FBPR_BARE, upper_32_bits(ba)); + bm_ccsr_out(REG_FBPR_BAR, lower_32_bits(ba)); + bm_ccsr_out(REG_FBPR_AR, exp - 1); +} + +/* + * Location and size of BMan private memory + * + * Ideally we would use the DMA API to turn rmem->base into a DMA address + * (especially if iommu translations ever get involved). Unfortunately, the + * DMA API currently does not allow mapping anything that is not backed with + * a struct page. 
+ */ +static dma_addr_t fbpr_a; +static size_t fbpr_sz; + +static int bman_fbpr(struct reserved_mem *rmem) +{ + fbpr_a = rmem->base; + fbpr_sz = rmem->size; + + WARN_ON(!(fbpr_a && fbpr_sz)); + + return 0; +} +RESERVEDMEM_OF_DECLARE(bman_fbpr, "fsl,bman-fbpr", bman_fbpr); + +static irqreturn_t bman_isr(int irq, void *ptr) +{ + u32 isr_val, ier_val, ecsr_val, isr_mask, i; + struct device *dev = ptr; + + ier_val = bm_ccsr_in(REG_ERR_IER); + isr_val = bm_ccsr_in(REG_ERR_ISR); + ecsr_val = bm_ccsr_in(REG_ECSR); + isr_mask = isr_val & ier_val; + + if (!isr_mask) + return IRQ_NONE; + + for (i = 0; i < ARRAY_SIZE(bman_hwerr_txts); i++) { + if (bman_hwerr_txts[i].mask & isr_mask) { + dev_err_ratelimited(dev, "ErrInt: %s\n", + bman_hwerr_txts[i].txt); + if (bman_hwerr_txts[i].mask & ecsr_val) { + /* Re-arm error capture registers */ + bm_ccsr_out(REG_ECSR, ecsr_val); + } + if (bman_hwerr_txts[i].mask & BMAN_ERRS_TO_DISABLE) { + dev_dbg(dev, "Disabling error 0x%x\n", + bman_hwerr_txts[i].mask); + ier_val &= ~bman_hwerr_txts[i].mask; + bm_ccsr_out(REG_ERR_IER, ier_val); + } + } + } + bm_ccsr_out(REG_ERR_ISR, isr_val); + + return IRQ_HANDLED; +} + +static int fsl_bman_probe(struct platform_device *pdev) +{ + int ret, err_irq; + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct resource *res; + u16 id, bm_pool_cnt; + u8 major, minor; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "Can't get %s property 'IORESOURCE_MEM'\n", + node->full_name); + return -ENXIO; + } + bm_ccsr_start = devm_ioremap(dev, res->start, + res->end - res->start + 1); + if (!bm_ccsr_start) + return -ENXIO; + + bm_get_version(&id, &major, &minor); + if (major == 1 && minor == 0) { + bman_ip_rev = BMAN_REV10; + bm_pool_cnt = BM_POOL_MAX; + } else if (major == 2 && minor == 0) { + bman_ip_rev = BMAN_REV20; + bm_pool_cnt = 8; + } else if (major == 2 && minor == 1) { + bman_ip_rev = BMAN_REV21; + bm_pool_cnt = BM_POOL_MAX; + } else { + dev_err(dev, "Unknown Bman version:%04x,%02x,%02x\n", + id, major, minor); + return -ENODEV; + } + + bm_set_memory(fbpr_a, fbpr_sz); + + err_irq = platform_get_irq(pdev, 0); + if (err_irq <= 0) { + dev_info(dev, "Can't get %s IRQ\n", node->full_name); + return -ENODEV; + } + ret = devm_request_irq(dev, err_irq, bman_isr, IRQF_SHARED, "bman-err", + dev); + if (ret) { + dev_err(dev, "devm_request_irq() failed %d for '%s'\n", + ret, node->full_name); + return ret; + } + /* Disable Buffer Pool State Change */ + bm_ccsr_out(REG_ERR_ISDR, BM_EIRQ_BSCN); + /* + * Write-to-clear any stale bits, (eg. starvation being asserted prior + * to resource allocation during driver init). 
+ */ + bm_ccsr_out(REG_ERR_ISR, 0xffffffff); + /* Enable Error Interrupts */ + bm_ccsr_out(REG_ERR_IER, 0xffffffff); + + bm_bpalloc = devm_gen_pool_create(dev, 0, -1, "bman-bpalloc"); + if (IS_ERR(bm_bpalloc)) { + ret = PTR_ERR(bm_bpalloc); + dev_err(dev, "bman-bpalloc pool init failed (%d)\n", ret); + return ret; + } + + /* seed BMan resource pool */ + ret = gen_pool_add(bm_bpalloc, DPAA_GENALLOC_OFF, bm_pool_cnt, -1); + if (ret) { + dev_err(dev, "Failed to seed BPID range [%d..%d] (%d)\n", + 0, bm_pool_cnt - 1, ret); + return ret; + } + + return 0; +}; + +static const struct of_device_id fsl_bman_ids[] = { + { + .compatible = "fsl,bman", + }, + {} +}; + +static struct platform_driver fsl_bman_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = fsl_bman_ids, + .suppress_bind_attrs = true, + }, + .probe = fsl_bman_probe, +}; + +builtin_platform_driver(fsl_bman_driver); diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c new file mode 100644 index 000000000000..6579cc18811a --- /dev/null +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -0,0 +1,219 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bman_priv.h" + +static struct bman_portal *affine_bportals[NR_CPUS]; +static struct cpumask portal_cpus; +/* protect bman global registers and global data shared among portals */ +static DEFINE_SPINLOCK(bman_lock); + +static struct bman_portal *init_pcfg(struct bm_portal_config *pcfg) +{ + struct bman_portal *p = bman_create_affine_portal(pcfg); + + if (!p) { + dev_crit(pcfg->dev, "%s: Portal failure on cpu %d\n", + __func__, pcfg->cpu); + return NULL; + } + + bman_p_irqsource_add(p, BM_PIRQ_RCRI); + affine_bportals[pcfg->cpu] = p; + + dev_info(pcfg->dev, "Portal initialised, cpu %d\n", pcfg->cpu); + + return p; +} + +static void bman_offline_cpu(unsigned int cpu) +{ + struct bman_portal *p = affine_bportals[cpu]; + const struct bm_portal_config *pcfg; + + if (!p) + return; + + pcfg = bman_get_bm_portal_config(p); + if (!pcfg) + return; + + irq_set_affinity(pcfg->irq, cpumask_of(0)); +} + +static void bman_online_cpu(unsigned int cpu) +{ + struct bman_portal *p = affine_bportals[cpu]; + const struct bm_portal_config *pcfg; + + if (!p) + return; + + pcfg = bman_get_bm_portal_config(p); + if (!pcfg) + return; + + irq_set_affinity(pcfg->irq, cpumask_of(cpu)); +} + +static int bman_hotplug_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + bman_online_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + bman_offline_cpu(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block bman_hotplug_cpu_notifier = { + .notifier_call = bman_hotplug_cpu_callback, +}; + +static int bman_portal_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct bm_portal_config *pcfg; + struct resource *addr_phys[2]; + void __iomem *va; + int irq, cpu; + + pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL); + if (!pcfg) + return -ENOMEM; + + pcfg->dev = dev; + + addr_phys[0] = platform_get_resource(pdev, IORESOURCE_MEM, + DPAA_PORTAL_CE); + if (!addr_phys[0]) { + dev_err(dev, "Can't get %s property 'reg::CE'\n", + node->full_name); + return -ENXIO; + } + + addr_phys[1] = platform_get_resource(pdev, IORESOURCE_MEM, + DPAA_PORTAL_CI); + if (!addr_phys[1]) { + dev_err(dev, "Can't get %s property 'reg::CI'\n", + node->full_name); + return -ENXIO; + } + + pcfg->cpu = -1; + + irq = platform_get_irq(pdev, 0); + if (irq <= 0) { + dev_err(dev, "Can't get %s IRQ'\n", node->full_name); + return -ENXIO; + } + pcfg->irq = irq; + + va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0); + if (!va) + goto err_ioremap1; + + pcfg->addr_virt[DPAA_PORTAL_CE] = va; + + va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]), + _PAGE_GUARDED | _PAGE_NO_CACHE); + if (!va) + goto err_ioremap2; + + pcfg->addr_virt[DPAA_PORTAL_CI] = va; + + spin_lock(&bman_lock); + cpu = cpumask_next_zero(-1, &portal_cpus); + if (cpu >= nr_cpu_ids) { + /* unassigned portal, skip init */ + spin_unlock(&bman_lock); + return 0; + } + + cpumask_set_cpu(cpu, &portal_cpus); + spin_unlock(&bman_lock); + pcfg->cpu = cpu; + + if (!init_pcfg(pcfg)) + goto err_ioremap2; + + /* clear irq affinity if assigned cpu is offline */ + if (!cpu_online(cpu)) + bman_offline_cpu(cpu); + + return 0; + +err_ioremap2: + iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]); +err_ioremap1: + dev_err(dev, "ioremap failed\n"); + return -ENXIO; +} + +static const struct of_device_id bman_portal_ids[] = { + { + 
.compatible = "fsl,bman-portal", + }, + {} +}; +MODULE_DEVICE_TABLE(of, bman_portal_ids); + +static struct platform_driver bman_portal_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = bman_portal_ids, + }, + .probe = bman_portal_probe, +}; + +static int __init bman_portal_driver_register(struct platform_driver *drv) +{ + int ret; + + ret = platform_driver_register(drv); + if (ret < 0) + return ret; + + register_hotcpu_notifier(&bman_hotplug_cpu_notifier); + + return 0; +} + +module_driver(bman_portal_driver, + bman_portal_driver_register, platform_driver_unregister); diff --git a/drivers/soc/fsl/qbman/bman_priv.h b/drivers/soc/fsl/qbman/bman_priv.h new file mode 100644 index 000000000000..f6896a2f6d90 --- /dev/null +++ b/drivers/soc/fsl/qbman/bman_priv.h @@ -0,0 +1,80 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "dpaa_sys.h" + +#include + +/* Portal processing (interrupt) sources */ +#define BM_PIRQ_RCRI 0x00000002 /* RCR Ring (below threshold) */ + +/* Revision info (for errata and feature handling) */ +#define BMAN_REV10 0x0100 +#define BMAN_REV20 0x0200 +#define BMAN_REV21 0x0201 +extern u16 bman_ip_rev; /* 0 if uninitialised, otherwise BMAN_REVx */ + +extern struct gen_pool *bm_bpalloc; + +struct bm_portal_config { + /* + * Corenet portal addresses; + * [0]==cache-enabled, [1]==cache-inhibited. 
+ */ + void __iomem *addr_virt[2]; + /* Allow these to be joined in lists */ + struct list_head list; + struct device *dev; + /* User-visible portal configuration settings */ + /* portal is affined to this cpu */ + int cpu; + /* portal interrupt line */ + int irq; +}; + +struct bman_portal *bman_create_affine_portal( + const struct bm_portal_config *config); +/* + * The below bman_p_***() variant might be called in a situation that the cpu + * which the portal affine to is not online yet. + * @bman_portal specifies which portal the API will use. + */ +int bman_p_irqsource_add(struct bman_portal *p, u32 bits); + +/* + * Used by all portal interrupt registers except 'inhibit' + * This mask contains all the "irqsource" bits visible to API users + */ +#define BM_PIRQ_VISIBLE BM_PIRQ_RCRI + +const struct bm_portal_config * +bman_get_bm_portal_config(const struct bman_portal *portal); diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h new file mode 100644 index 000000000000..b63fd72295c6 --- /dev/null +++ b/drivers/soc/fsl/qbman/dpaa_sys.h @@ -0,0 +1,103 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __DPAA_SYS_H +#define __DPAA_SYS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* For 2-element tables related to cache-inhibited and cache-enabled mappings */ +#define DPAA_PORTAL_CE 0 +#define DPAA_PORTAL_CI 1 + +#if (L1_CACHE_BYTES != 32) && (L1_CACHE_BYTES != 64) +#error "Unsupported Cacheline Size" +#endif + +static inline void dpaa_flush(void *p) +{ +#ifdef CONFIG_PPC + flush_dcache_range((unsigned long)p, (unsigned long)p+64); +#elif defined(CONFIG_ARM32) + __cpuc_flush_dcache_area(p, 64); +#elif defined(CONFIG_ARM64) + __flush_dcache_area(p, 64); +#endif +} + +#define dpaa_invalidate(p) dpaa_flush(p) + +#define dpaa_zero(p) memset(p, 0, 64) + +static inline void dpaa_touch_ro(void *p) +{ +#if (L1_CACHE_BYTES == 32) + prefetch(p+32); +#endif + prefetch(p); +} + +/* Commonly used combo */ +static inline void dpaa_invalidate_touch_ro(void *p) +{ + dpaa_invalidate(p); + dpaa_touch_ro(p); +} + + +#ifdef CONFIG_FSL_DPAA_CHECKING +#define DPAA_ASSERT(x) WARN_ON(!(x)) +#else +#define DPAA_ASSERT(x) +#endif + +/* cyclic helper for rings */ +static inline u8 dpaa_cyc_diff(u8 ringsize, u8 first, u8 last) +{ + /* 'first' is included, 'last' is excluded */ + if (first <= last) + return last - first; + return ringsize + last - first; +} + +/* Offset applied to genalloc pools due to zero being an error return */ +#define DPAA_GENALLOC_OFF 0x80000000 + +#endif /* __DPAA_SYS_H */ diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h new file mode 100644 index 000000000000..eaaf56df4086 --- /dev/null +++ b/include/soc/fsl/bman.h @@ -0,0 +1,129 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __FSL_BMAN_H +#define __FSL_BMAN_H + +/* wrapper for 48-bit buffers */ +struct bm_buffer { + union { + struct { + __be16 bpid; /* hi 8-bits reserved */ + __be16 hi; /* High 16-bits of 48-bit address */ + __be32 lo; /* Low 32-bits of 48-bit address */ + }; + __be64 data; + }; +} __aligned(8); +/* + * Restore the 48 bit address previously stored in BMan + * hardware pools as a dma_addr_t + */ +static inline dma_addr_t bm_buf_addr(const struct bm_buffer *buf) +{ + return be64_to_cpu(buf->data) & 0xffffffffffffLLU; +} + +static inline u64 bm_buffer_get64(const struct bm_buffer *buf) +{ + return be64_to_cpu(buf->data) & 0xffffffffffffLLU; +} + +static inline void bm_buffer_set64(struct bm_buffer *buf, u64 addr) +{ + buf->hi = cpu_to_be16(upper_32_bits(addr)); + buf->lo = cpu_to_be32(lower_32_bits(addr)); +} + +static inline u8 bm_buffer_get_bpid(const struct bm_buffer *buf) +{ + return be16_to_cpu(buf->bpid) & 0xff; +} + +static inline void bm_buffer_set_bpid(struct bm_buffer *buf, int bpid) +{ + buf->bpid = cpu_to_be16(bpid & 0xff); +} + +/* Managed portal, high-level i/face */ + +/* Portal and Buffer Pools */ +struct bman_portal; +struct bman_pool; + +#define BM_POOL_MAX 64 /* max # of buffer pools */ + +/** + * bman_new_pool - Allocates a Buffer Pool object + * + * Creates a pool object, and returns a reference to it or NULL on error. + */ +struct bman_pool *bman_new_pool(void); + +/** + * bman_free_pool - Deallocates a Buffer Pool object + * @pool: the pool object to release + */ +void bman_free_pool(struct bman_pool *pool); + +/** + * bman_get_bpid - Returns a pool object's BPID. + * @pool: the pool object + * + * The returned value is the index of the encapsulated buffer pool, + * in the range of [0, @BM_POOL_MAX-1]. + */ +int bman_get_bpid(const struct bman_pool *pool); + +/** + * bman_release - Release buffer(s) to the buffer pool + * @pool: the buffer pool object to release to + * @bufs: an array of buffers to release + * @num: the number of buffers in @bufs (1-8) + * + * Adds the given buffers to RCR entries. If the RCR ring is unresponsive, + * the function will return -ETIMEDOUT. Otherwise, it returns zero. + */ +int bman_release(struct bman_pool *pool, const struct bm_buffer *bufs, u8 num); + +/** + * bman_acquire - Acquire buffer(s) from a buffer pool + * @pool: the buffer pool object to acquire from + * @bufs: array for storing the acquired buffers + * @num: the number of buffers desired (@bufs is at least this big) + * + * Issues an "Acquire" command via the portal's management command interface. + * The return value will be the number of buffers obtained from the pool, or a + * negative error code if a h/w error or pool starvation was encountered. In + * the latter case, the content of @bufs is undefined. + */ +int bman_acquire(struct bman_pool *pool, struct bm_buffer *bufs, u8 num); + +#endif /* __FSL_BMAN_H */ -- cgit v1.2.3 From c535e923bb97a4b361e89a6383693482057f8b0c Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Thu, 22 Sep 2016 18:04:09 +0300 Subject: soc/fsl: Introduce DPAA 1.x QMan device driver This driver enables the Freescale DPAA 1.x Queue Manager block. QMan is a hardware accelerator that manages frame queues. It allows CPUs and other accelerators connected to the SoC datapath to enqueue and dequeue ethernet frames, thus providing the infrastructure for data exchange among CPUs and datapath accelerators. 
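For readers unfamiliar with the interface this patch exports, the following is a minimal, illustrative sketch (not part of the patch; error handling elided, and the my_rx_* names are hypothetical) of how a datapath consumer would use the frame queue API declared in include/soc/fsl/qman.h: embed a struct qman_fq, supply a DQRR callback, then create, initialise and schedule the queue on the caller's affine portal.

    #include <soc/fsl/qman.h>

    /* Consume each dequeued frame descriptor and let QMan reclaim the entry */
    static enum qman_cb_dqrr_result my_rx_dqrr(struct qman_portal *portal,
                                               struct qman_fq *fq,
                                               const struct qm_dqrr_entry *dq)
    {
            /* a real driver would hand dq->fd off to its datapath here */
            return qman_cb_dqrr_consume;
    }

    static struct qman_fq my_rx_fq = {
            .cb.dqrr = my_rx_dqrr,
    };

    static int my_rx_fq_setup(void)
    {
            struct qm_mcc_initfq opts = {};
            int err;

            /* have QMan pick a free FQID and register our callbacks */
            err = qman_create_fq(0, QMAN_FQ_FLAG_DYNAMIC_FQID, &my_rx_fq);
            if (err)
                    return err;

            /* initialise the FQ, target the local portal and schedule it */
            return qman_init_fq(&my_rx_fq, QMAN_INITFQ_FLAG_SCHED |
                                           QMAN_INITFQ_FLAG_LOCAL, &opts);
    }
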
Signed-off-by: Roy Pledge Signed-off-by: Claudiu Manoil Signed-off-by: Scott Wood --- drivers/soc/fsl/qbman/Makefile | 5 +- drivers/soc/fsl/qbman/qman.c | 2881 +++++++++++++++++++++++++++++++++++ drivers/soc/fsl/qbman/qman_ccsr.c | 808 ++++++++++ drivers/soc/fsl/qbman/qman_portal.c | 355 +++++ drivers/soc/fsl/qbman/qman_priv.h | 371 +++++ include/soc/fsl/qman.h | 1074 +++++++++++++ 6 files changed, 5492 insertions(+), 2 deletions(-) create mode 100644 drivers/soc/fsl/qbman/qman.c create mode 100644 drivers/soc/fsl/qbman/qman_ccsr.c create mode 100644 drivers/soc/fsl/qbman/qman_portal.c create mode 100644 drivers/soc/fsl/qbman/qman_priv.h create mode 100644 include/soc/fsl/qman.h (limited to 'include') diff --git a/drivers/soc/fsl/qbman/Makefile b/drivers/soc/fsl/qbman/Makefile index 855c3ac0f89e..6e0ee30b2834 100644 --- a/drivers/soc/fsl/qbman/Makefile +++ b/drivers/soc/fsl/qbman/Makefile @@ -1,2 +1,3 @@ -obj-$(CONFIG_FSL_DPAA) += bman_ccsr.o bman_portal.o \ - bman.o +obj-$(CONFIG_FSL_DPAA) += bman_ccsr.o qman_ccsr.o \ + bman_portal.o qman_portal.o \ + bman.o qman.o diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c new file mode 100644 index 000000000000..119054bc922b --- /dev/null +++ b/drivers/soc/fsl/qbman/qman.c @@ -0,0 +1,2881 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "qman_priv.h" + +#define DQRR_MAXFILL 15 +#define EQCR_ITHRESH 4 /* if EQCR congests, interrupt threshold */ +#define IRQNAME "QMan portal %d" +#define MAX_IRQNAME 16 /* big enough for "QMan portal %d" */ +#define QMAN_POLL_LIMIT 32 +#define QMAN_PIRQ_DQRR_ITHRESH 12 +#define QMAN_PIRQ_MR_ITHRESH 4 +#define QMAN_PIRQ_IPERIOD 100 + +/* Portal register assists */ + +/* Cache-inhibited register offsets */ +#define QM_REG_EQCR_PI_CINH 0x0000 +#define QM_REG_EQCR_CI_CINH 0x0004 +#define QM_REG_EQCR_ITR 0x0008 +#define QM_REG_DQRR_PI_CINH 0x0040 +#define QM_REG_DQRR_CI_CINH 0x0044 +#define QM_REG_DQRR_ITR 0x0048 +#define QM_REG_DQRR_DCAP 0x0050 +#define QM_REG_DQRR_SDQCR 0x0054 +#define QM_REG_DQRR_VDQCR 0x0058 +#define QM_REG_DQRR_PDQCR 0x005c +#define QM_REG_MR_PI_CINH 0x0080 +#define QM_REG_MR_CI_CINH 0x0084 +#define QM_REG_MR_ITR 0x0088 +#define QM_REG_CFG 0x0100 +#define QM_REG_ISR 0x0e00 +#define QM_REG_IER 0x0e04 +#define QM_REG_ISDR 0x0e08 +#define QM_REG_IIR 0x0e0c +#define QM_REG_ITPR 0x0e14 + +/* Cache-enabled register offsets */ +#define QM_CL_EQCR 0x0000 +#define QM_CL_DQRR 0x1000 +#define QM_CL_MR 0x2000 +#define QM_CL_EQCR_PI_CENA 0x3000 +#define QM_CL_EQCR_CI_CENA 0x3100 +#define QM_CL_DQRR_PI_CENA 0x3200 +#define QM_CL_DQRR_CI_CENA 0x3300 +#define QM_CL_MR_PI_CENA 0x3400 +#define QM_CL_MR_CI_CENA 0x3500 +#define QM_CL_CR 0x3800 +#define QM_CL_RR0 0x3900 +#define QM_CL_RR1 0x3940 + +/* + * BTW, the drivers (and h/w programming model) already obtain the required + * synchronisation for portal accesses and data-dependencies. Use of barrier()s + * or other order-preserving primitives simply degrade performance. Hence the + * use of the __raw_*() interfaces, which simply ensure that the compiler treats + * the portal registers as volatile + */ + +/* Cache-enabled ring access */ +#define qm_cl(base, idx) ((void *)base + ((idx) << 6)) + +/* + * Portal modes. + * Enum types; + * pmode == production mode + * cmode == consumption mode, + * dmode == h/w dequeue mode. + * Enum values use 3 letter codes. First letter matches the portal mode, + * remaining two letters indicate; + * ci == cache-inhibited portal register + * ce == cache-enabled portal register + * vb == in-band valid-bit (cache-enabled) + * dc == DCA (Discrete Consumption Acknowledgment), DQRR-only + * As for "enum qm_dqrr_dmode", it should be self-explanatory. 
+ */ +enum qm_eqcr_pmode { /* matches QCSP_CFG::EPM */ + qm_eqcr_pci = 0, /* PI index, cache-inhibited */ + qm_eqcr_pce = 1, /* PI index, cache-enabled */ + qm_eqcr_pvb = 2 /* valid-bit */ +}; +enum qm_dqrr_dmode { /* matches QCSP_CFG::DP */ + qm_dqrr_dpush = 0, /* SDQCR + VDQCR */ + qm_dqrr_dpull = 1 /* PDQCR */ +}; +enum qm_dqrr_pmode { /* s/w-only */ + qm_dqrr_pci, /* reads DQRR_PI_CINH */ + qm_dqrr_pce, /* reads DQRR_PI_CENA */ + qm_dqrr_pvb /* reads valid-bit */ +}; +enum qm_dqrr_cmode { /* matches QCSP_CFG::DCM */ + qm_dqrr_cci = 0, /* CI index, cache-inhibited */ + qm_dqrr_cce = 1, /* CI index, cache-enabled */ + qm_dqrr_cdc = 2 /* Discrete Consumption Acknowledgment */ +}; +enum qm_mr_pmode { /* s/w-only */ + qm_mr_pci, /* reads MR_PI_CINH */ + qm_mr_pce, /* reads MR_PI_CENA */ + qm_mr_pvb /* reads valid-bit */ +}; +enum qm_mr_cmode { /* matches QCSP_CFG::MM */ + qm_mr_cci = 0, /* CI index, cache-inhibited */ + qm_mr_cce = 1 /* CI index, cache-enabled */ +}; + +/* --- Portal structures --- */ + +#define QM_EQCR_SIZE 8 +#define QM_DQRR_SIZE 16 +#define QM_MR_SIZE 8 + +/* "Enqueue Command" */ +struct qm_eqcr_entry { + u8 _ncw_verb; /* writes to this are non-coherent */ + u8 dca; + u16 seqnum; + u32 orp; /* 24-bit */ + u32 fqid; /* 24-bit */ + u32 tag; + struct qm_fd fd; + u8 __reserved3[32]; +} __packed; +#define QM_EQCR_VERB_VBIT 0x80 +#define QM_EQCR_VERB_CMD_MASK 0x61 /* but only one value; */ +#define QM_EQCR_VERB_CMD_ENQUEUE 0x01 +#define QM_EQCR_SEQNUM_NESN 0x8000 /* Advance NESN */ +#define QM_EQCR_SEQNUM_NLIS 0x4000 /* More fragments to come */ +#define QM_EQCR_SEQNUM_SEQMASK 0x3fff /* sequence number goes here */ + +struct qm_eqcr { + struct qm_eqcr_entry *ring, *cursor; + u8 ci, available, ithresh, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + u32 busy; + enum qm_eqcr_pmode pmode; +#endif +}; + +struct qm_dqrr { + const struct qm_dqrr_entry *ring, *cursor; + u8 pi, ci, fill, ithresh, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + enum qm_dqrr_dmode dmode; + enum qm_dqrr_pmode pmode; + enum qm_dqrr_cmode cmode; +#endif +}; + +struct qm_mr { + union qm_mr_entry *ring, *cursor; + u8 pi, ci, fill, ithresh, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + enum qm_mr_pmode pmode; + enum qm_mr_cmode cmode; +#endif +}; + +/* MC (Management Command) command */ +/* "Query FQ" */ +struct qm_mcc_queryfq { + u8 _ncw_verb; + u8 __reserved1[3]; + u32 fqid; /* 24-bit */ + u8 __reserved2[56]; +} __packed; +/* "Alter FQ State Commands " */ +struct qm_mcc_alterfq { + u8 _ncw_verb; + u8 __reserved1[3]; + u32 fqid; /* 24-bit */ + u8 __reserved2; + u8 count; /* number of consecutive FQID */ + u8 __reserved3[10]; + u32 context_b; /* frame queue context b */ + u8 __reserved4[40]; +} __packed; + +/* "Query CGR" */ +struct qm_mcc_querycgr { + u8 _ncw_verb; + u8 __reserved1[30]; + u8 cgid; + u8 __reserved2[32]; +}; + +struct qm_mcc_querywq { + u8 _ncw_verb; + u8 __reserved; + /* select channel if verb != QUERYWQ_DEDICATED */ + u16 channel_wq; /* ignores wq (3 lsbits): _res[0-2] */ + u8 __reserved2[60]; +} __packed; + +#define QM_MCC_VERB_VBIT 0x80 +#define QM_MCC_VERB_MASK 0x7f /* where the verb contains; */ +#define QM_MCC_VERB_INITFQ_PARKED 0x40 +#define QM_MCC_VERB_INITFQ_SCHED 0x41 +#define QM_MCC_VERB_QUERYFQ 0x44 +#define QM_MCC_VERB_QUERYFQ_NP 0x45 /* "non-programmable" fields */ +#define QM_MCC_VERB_QUERYWQ 0x46 +#define QM_MCC_VERB_QUERYWQ_DEDICATED 0x47 +#define QM_MCC_VERB_ALTER_SCHED 0x48 /* Schedule FQ */ +#define QM_MCC_VERB_ALTER_FE 0x49 /* Force Eligible FQ */ +#define QM_MCC_VERB_ALTER_RETIRE 0x4a /* 
Retire FQ */ +#define QM_MCC_VERB_ALTER_OOS 0x4b /* Take FQ out of service */ +#define QM_MCC_VERB_ALTER_FQXON 0x4d /* FQ XON */ +#define QM_MCC_VERB_ALTER_FQXOFF 0x4e /* FQ XOFF */ +#define QM_MCC_VERB_INITCGR 0x50 +#define QM_MCC_VERB_MODIFYCGR 0x51 +#define QM_MCC_VERB_CGRTESTWRITE 0x52 +#define QM_MCC_VERB_QUERYCGR 0x58 +#define QM_MCC_VERB_QUERYCONGESTION 0x59 +union qm_mc_command { + struct { + u8 _ncw_verb; /* writes to this are non-coherent */ + u8 __reserved[63]; + }; + struct qm_mcc_initfq initfq; + struct qm_mcc_queryfq queryfq; + struct qm_mcc_alterfq alterfq; + struct qm_mcc_initcgr initcgr; + struct qm_mcc_querycgr querycgr; + struct qm_mcc_querywq querywq; + struct qm_mcc_queryfq_np queryfq_np; +}; + +/* MC (Management Command) result */ +/* "Query FQ" */ +struct qm_mcr_queryfq { + u8 verb; + u8 result; + u8 __reserved1[8]; + struct qm_fqd fqd; /* the FQD fields are here */ + u8 __reserved2[30]; +} __packed; + +/* "Alter FQ State Commands" */ +struct qm_mcr_alterfq { + u8 verb; + u8 result; + u8 fqs; /* Frame Queue Status */ + u8 __reserved1[61]; +}; +#define QM_MCR_VERB_RRID 0x80 +#define QM_MCR_VERB_MASK QM_MCC_VERB_MASK +#define QM_MCR_VERB_INITFQ_PARKED QM_MCC_VERB_INITFQ_PARKED +#define QM_MCR_VERB_INITFQ_SCHED QM_MCC_VERB_INITFQ_SCHED +#define QM_MCR_VERB_QUERYFQ QM_MCC_VERB_QUERYFQ +#define QM_MCR_VERB_QUERYFQ_NP QM_MCC_VERB_QUERYFQ_NP +#define QM_MCR_VERB_QUERYWQ QM_MCC_VERB_QUERYWQ +#define QM_MCR_VERB_QUERYWQ_DEDICATED QM_MCC_VERB_QUERYWQ_DEDICATED +#define QM_MCR_VERB_ALTER_SCHED QM_MCC_VERB_ALTER_SCHED +#define QM_MCR_VERB_ALTER_FE QM_MCC_VERB_ALTER_FE +#define QM_MCR_VERB_ALTER_RETIRE QM_MCC_VERB_ALTER_RETIRE +#define QM_MCR_VERB_ALTER_OOS QM_MCC_VERB_ALTER_OOS +#define QM_MCR_RESULT_NULL 0x00 +#define QM_MCR_RESULT_OK 0xf0 +#define QM_MCR_RESULT_ERR_FQID 0xf1 +#define QM_MCR_RESULT_ERR_FQSTATE 0xf2 +#define QM_MCR_RESULT_ERR_NOTEMPTY 0xf3 /* OOS fails if FQ is !empty */ +#define QM_MCR_RESULT_ERR_BADCHANNEL 0xf4 +#define QM_MCR_RESULT_PENDING 0xf8 +#define QM_MCR_RESULT_ERR_BADCOMMAND 0xff +#define QM_MCR_FQS_ORLPRESENT 0x02 /* ORL fragments to come */ +#define QM_MCR_FQS_NOTEMPTY 0x01 /* FQ has enqueued frames */ +#define QM_MCR_TIMEOUT 10000 /* us */ +union qm_mc_result { + struct { + u8 verb; + u8 result; + u8 __reserved1[62]; + }; + struct qm_mcr_queryfq queryfq; + struct qm_mcr_alterfq alterfq; + struct qm_mcr_querycgr querycgr; + struct qm_mcr_querycongestion querycongestion; + struct qm_mcr_querywq querywq; + struct qm_mcr_queryfq_np queryfq_np; +}; + +struct qm_mc { + union qm_mc_command *cr; + union qm_mc_result *rr; + u8 rridx, vbit; +#ifdef CONFIG_FSL_DPAA_CHECKING + enum { + /* Can be _mc_start()ed */ + qman_mc_idle, + /* Can be _mc_commit()ed or _mc_abort()ed */ + qman_mc_user, + /* Can only be _mc_retry()ed */ + qman_mc_hw + } state; +#endif +}; + +struct qm_addr { + void __iomem *ce; /* cache-enabled */ + void __iomem *ci; /* cache-inhibited */ +}; + +struct qm_portal { + /* + * In the non-CONFIG_FSL_DPAA_CHECKING case, the following stuff up to + * and including 'mc' fits within a cacheline (yay!). The 'config' part + * is setup-only, so isn't a cause for a concern. In other words, don't + * rearrange this structure on a whim, there be dragons ... + */ + struct qm_addr addr; + struct qm_eqcr eqcr; + struct qm_dqrr dqrr; + struct qm_mr mr; + struct qm_mc mc; +} ____cacheline_aligned; + +/* Cache-inhibited register access. 
*/ +static inline u32 qm_in(struct qm_portal *p, u32 offset) +{ + return __raw_readl(p->addr.ci + offset); +} + +static inline void qm_out(struct qm_portal *p, u32 offset, u32 val) +{ + __raw_writel(val, p->addr.ci + offset); +} + +/* Cache Enabled Portal Access */ +static inline void qm_cl_invalidate(struct qm_portal *p, u32 offset) +{ + dpaa_invalidate(p->addr.ce + offset); +} + +static inline void qm_cl_touch_ro(struct qm_portal *p, u32 offset) +{ + dpaa_touch_ro(p->addr.ce + offset); +} + +static inline u32 qm_ce_in(struct qm_portal *p, u32 offset) +{ + return __raw_readl(p->addr.ce + offset); +} + +/* --- EQCR API --- */ + +#define EQCR_SHIFT ilog2(sizeof(struct qm_eqcr_entry)) +#define EQCR_CARRY (uintptr_t)(QM_EQCR_SIZE << EQCR_SHIFT) + +/* Bit-wise logic to wrap a ring pointer by clearing the "carry bit" */ +static struct qm_eqcr_entry *eqcr_carryclear(struct qm_eqcr_entry *p) +{ + uintptr_t addr = (uintptr_t)p; + + addr &= ~EQCR_CARRY; + + return (struct qm_eqcr_entry *)addr; +} + +/* Bit-wise logic to convert a ring pointer to a ring index */ +static int eqcr_ptr2idx(struct qm_eqcr_entry *e) +{ + return ((uintptr_t)e >> EQCR_SHIFT) & (QM_EQCR_SIZE - 1); +} + +/* Increment the 'cursor' ring pointer, taking 'vbit' into account */ +static inline void eqcr_inc(struct qm_eqcr *eqcr) +{ + /* increment to the next EQCR pointer and handle overflow and 'vbit' */ + struct qm_eqcr_entry *partial = eqcr->cursor + 1; + + eqcr->cursor = eqcr_carryclear(partial); + if (partial != eqcr->cursor) + eqcr->vbit ^= QM_EQCR_VERB_VBIT; +} + +static inline int qm_eqcr_init(struct qm_portal *portal, + enum qm_eqcr_pmode pmode, + unsigned int eq_stash_thresh, + int eq_stash_prio) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + u32 cfg; + u8 pi; + + eqcr->ring = portal->addr.ce + QM_CL_EQCR; + eqcr->ci = qm_in(portal, QM_REG_EQCR_CI_CINH) & (QM_EQCR_SIZE - 1); + qm_cl_invalidate(portal, QM_CL_EQCR_CI_CENA); + pi = qm_in(portal, QM_REG_EQCR_PI_CINH) & (QM_EQCR_SIZE - 1); + eqcr->cursor = eqcr->ring + pi; + eqcr->vbit = (qm_in(portal, QM_REG_EQCR_PI_CINH) & QM_EQCR_SIZE) ? 
+ QM_EQCR_VERB_VBIT : 0; + eqcr->available = QM_EQCR_SIZE - 1 - + dpaa_cyc_diff(QM_EQCR_SIZE, eqcr->ci, pi); + eqcr->ithresh = qm_in(portal, QM_REG_EQCR_ITR); +#ifdef CONFIG_FSL_DPAA_CHECKING + eqcr->busy = 0; + eqcr->pmode = pmode; +#endif + cfg = (qm_in(portal, QM_REG_CFG) & 0x00ffffff) | + (eq_stash_thresh << 28) | /* QCSP_CFG: EST */ + (eq_stash_prio << 26) | /* QCSP_CFG: EP */ + ((pmode & 0x3) << 24); /* QCSP_CFG::EPM */ + qm_out(portal, QM_REG_CFG, cfg); + return 0; +} + +static inline unsigned int qm_eqcr_get_ci_stashing(struct qm_portal *portal) +{ + return (qm_in(portal, QM_REG_CFG) >> 28) & 0x7; +} + +static inline void qm_eqcr_finish(struct qm_portal *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + u8 pi = qm_in(portal, QM_REG_EQCR_PI_CINH) & (QM_EQCR_SIZE - 1); + u8 ci = qm_in(portal, QM_REG_EQCR_CI_CINH) & (QM_EQCR_SIZE - 1); + + DPAA_ASSERT(!eqcr->busy); + if (pi != eqcr_ptr2idx(eqcr->cursor)) + pr_crit("losing uncommited EQCR entries\n"); + if (ci != eqcr->ci) + pr_crit("missing existing EQCR completions\n"); + if (eqcr->ci != eqcr_ptr2idx(eqcr->cursor)) + pr_crit("EQCR destroyed unquiesced\n"); +} + +static inline struct qm_eqcr_entry *qm_eqcr_start_no_stash(struct qm_portal + *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + + DPAA_ASSERT(!eqcr->busy); + if (!eqcr->available) + return NULL; + +#ifdef CONFIG_FSL_DPAA_CHECKING + eqcr->busy = 1; +#endif + dpaa_zero(eqcr->cursor); + return eqcr->cursor; +} + +static inline struct qm_eqcr_entry *qm_eqcr_start_stash(struct qm_portal + *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + u8 diff, old_ci; + + DPAA_ASSERT(!eqcr->busy); + if (!eqcr->available) { + old_ci = eqcr->ci; + eqcr->ci = qm_ce_in(portal, QM_CL_EQCR_CI_CENA) & + (QM_EQCR_SIZE - 1); + diff = dpaa_cyc_diff(QM_EQCR_SIZE, old_ci, eqcr->ci); + eqcr->available += diff; + if (!diff) + return NULL; + } +#ifdef CONFIG_FSL_DPAA_CHECKING + eqcr->busy = 1; +#endif + dpaa_zero(eqcr->cursor); + return eqcr->cursor; +} + +static inline void eqcr_commit_checks(struct qm_eqcr *eqcr) +{ + DPAA_ASSERT(eqcr->busy); + DPAA_ASSERT(eqcr->cursor->orp == (eqcr->cursor->orp & 0x00ffffff)); + DPAA_ASSERT(eqcr->cursor->fqid == (eqcr->cursor->fqid & 0x00ffffff)); + DPAA_ASSERT(eqcr->available >= 1); +} + +static inline void qm_eqcr_pvb_commit(struct qm_portal *portal, u8 myverb) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + struct qm_eqcr_entry *eqcursor; + + eqcr_commit_checks(eqcr); + DPAA_ASSERT(eqcr->pmode == qm_eqcr_pvb); + dma_wmb(); + eqcursor = eqcr->cursor; + eqcursor->_ncw_verb = myverb | eqcr->vbit; + dpaa_flush(eqcursor); + eqcr_inc(eqcr); + eqcr->available--; +#ifdef CONFIG_FSL_DPAA_CHECKING + eqcr->busy = 0; +#endif +} + +static inline void qm_eqcr_cce_prefetch(struct qm_portal *portal) +{ + qm_cl_touch_ro(portal, QM_CL_EQCR_CI_CENA); +} + +static inline u8 qm_eqcr_cce_update(struct qm_portal *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + u8 diff, old_ci = eqcr->ci; + + eqcr->ci = qm_ce_in(portal, QM_CL_EQCR_CI_CENA) & (QM_EQCR_SIZE - 1); + qm_cl_invalidate(portal, QM_CL_EQCR_CI_CENA); + diff = dpaa_cyc_diff(QM_EQCR_SIZE, old_ci, eqcr->ci); + eqcr->available += diff; + return diff; +} + +static inline void qm_eqcr_set_ithresh(struct qm_portal *portal, u8 ithresh) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + + eqcr->ithresh = ithresh; + qm_out(portal, QM_REG_EQCR_ITR, ithresh); +} + +static inline u8 qm_eqcr_get_avail(struct qm_portal *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + + return eqcr->available; +} + +static inline u8 qm_eqcr_get_fill(struct 
qm_portal *portal) +{ + struct qm_eqcr *eqcr = &portal->eqcr; + + return QM_EQCR_SIZE - 1 - eqcr->available; +} + +/* --- DQRR API --- */ + +#define DQRR_SHIFT ilog2(sizeof(struct qm_dqrr_entry)) +#define DQRR_CARRY (uintptr_t)(QM_DQRR_SIZE << DQRR_SHIFT) + +static const struct qm_dqrr_entry *dqrr_carryclear( + const struct qm_dqrr_entry *p) +{ + uintptr_t addr = (uintptr_t)p; + + addr &= ~DQRR_CARRY; + + return (const struct qm_dqrr_entry *)addr; +} + +static inline int dqrr_ptr2idx(const struct qm_dqrr_entry *e) +{ + return ((uintptr_t)e >> DQRR_SHIFT) & (QM_DQRR_SIZE - 1); +} + +static const struct qm_dqrr_entry *dqrr_inc(const struct qm_dqrr_entry *e) +{ + return dqrr_carryclear(e + 1); +} + +static inline void qm_dqrr_set_maxfill(struct qm_portal *portal, u8 mf) +{ + qm_out(portal, QM_REG_CFG, (qm_in(portal, QM_REG_CFG) & 0xff0fffff) | + ((mf & (QM_DQRR_SIZE - 1)) << 20)); +} + +static inline int qm_dqrr_init(struct qm_portal *portal, + const struct qm_portal_config *config, + enum qm_dqrr_dmode dmode, + enum qm_dqrr_pmode pmode, + enum qm_dqrr_cmode cmode, u8 max_fill) +{ + struct qm_dqrr *dqrr = &portal->dqrr; + u32 cfg; + + /* Make sure the DQRR will be idle when we enable */ + qm_out(portal, QM_REG_DQRR_SDQCR, 0); + qm_out(portal, QM_REG_DQRR_VDQCR, 0); + qm_out(portal, QM_REG_DQRR_PDQCR, 0); + dqrr->ring = portal->addr.ce + QM_CL_DQRR; + dqrr->pi = qm_in(portal, QM_REG_DQRR_PI_CINH) & (QM_DQRR_SIZE - 1); + dqrr->ci = qm_in(portal, QM_REG_DQRR_CI_CINH) & (QM_DQRR_SIZE - 1); + dqrr->cursor = dqrr->ring + dqrr->ci; + dqrr->fill = dpaa_cyc_diff(QM_DQRR_SIZE, dqrr->ci, dqrr->pi); + dqrr->vbit = (qm_in(portal, QM_REG_DQRR_PI_CINH) & QM_DQRR_SIZE) ? + QM_DQRR_VERB_VBIT : 0; + dqrr->ithresh = qm_in(portal, QM_REG_DQRR_ITR); +#ifdef CONFIG_FSL_DPAA_CHECKING + dqrr->dmode = dmode; + dqrr->pmode = pmode; + dqrr->cmode = cmode; +#endif + /* Invalidate every ring entry before beginning */ + for (cfg = 0; cfg < QM_DQRR_SIZE; cfg++) + dpaa_invalidate(qm_cl(dqrr->ring, cfg)); + cfg = (qm_in(portal, QM_REG_CFG) & 0xff000f00) | + ((max_fill & (QM_DQRR_SIZE - 1)) << 20) | /* DQRR_MF */ + ((dmode & 1) << 18) | /* DP */ + ((cmode & 3) << 16) | /* DCM */ + 0xa0 | /* RE+SE */ + (0 ? 0x40 : 0) | /* Ignore RP */ + (0 ? 0x10 : 0); /* Ignore SP */ + qm_out(portal, QM_REG_CFG, cfg); + qm_dqrr_set_maxfill(portal, max_fill); + return 0; +} + +static inline void qm_dqrr_finish(struct qm_portal *portal) +{ +#ifdef CONFIG_FSL_DPAA_CHECKING + struct qm_dqrr *dqrr = &portal->dqrr; + + if (dqrr->cmode != qm_dqrr_cdc && + dqrr->ci != dqrr_ptr2idx(dqrr->cursor)) + pr_crit("Ignoring completed DQRR entries\n"); +#endif +} + +static inline const struct qm_dqrr_entry *qm_dqrr_current( + struct qm_portal *portal) +{ + struct qm_dqrr *dqrr = &portal->dqrr; + + if (!dqrr->fill) + return NULL; + return dqrr->cursor; +} + +static inline u8 qm_dqrr_next(struct qm_portal *portal) +{ + struct qm_dqrr *dqrr = &portal->dqrr; + + DPAA_ASSERT(dqrr->fill); + dqrr->cursor = dqrr_inc(dqrr->cursor); + return --dqrr->fill; +} + +static inline void qm_dqrr_pvb_update(struct qm_portal *portal) +{ + struct qm_dqrr *dqrr = &portal->dqrr; + struct qm_dqrr_entry *res = qm_cl(dqrr->ring, dqrr->pi); + + DPAA_ASSERT(dqrr->pmode == qm_dqrr_pvb); +#ifndef CONFIG_FSL_PAMU + /* + * If PAMU is not available we need to invalidate the cache. 
+ * When PAMU is available the cache is updated by stash + */ + dpaa_invalidate_touch_ro(res); +#endif + /* + * when accessing 'verb', use __raw_readb() to ensure that compiler + * inlining doesn't try to optimise out "excess reads". + */ + if ((__raw_readb(&res->verb) & QM_DQRR_VERB_VBIT) == dqrr->vbit) { + dqrr->pi = (dqrr->pi + 1) & (QM_DQRR_SIZE - 1); + if (!dqrr->pi) + dqrr->vbit ^= QM_DQRR_VERB_VBIT; + dqrr->fill++; + } +} + +static inline void qm_dqrr_cdc_consume_1ptr(struct qm_portal *portal, + const struct qm_dqrr_entry *dq, + int park) +{ + __maybe_unused struct qm_dqrr *dqrr = &portal->dqrr; + int idx = dqrr_ptr2idx(dq); + + DPAA_ASSERT(dqrr->cmode == qm_dqrr_cdc); + DPAA_ASSERT((dqrr->ring + idx) == dq); + DPAA_ASSERT(idx < QM_DQRR_SIZE); + qm_out(portal, QM_REG_DQRR_DCAP, (0 << 8) | /* DQRR_DCAP::S */ + ((park ? 1 : 0) << 6) | /* DQRR_DCAP::PK */ + idx); /* DQRR_DCAP::DCAP_CI */ +} + +static inline void qm_dqrr_cdc_consume_n(struct qm_portal *portal, u32 bitmask) +{ + __maybe_unused struct qm_dqrr *dqrr = &portal->dqrr; + + DPAA_ASSERT(dqrr->cmode == qm_dqrr_cdc); + qm_out(portal, QM_REG_DQRR_DCAP, (1 << 8) | /* DQRR_DCAP::S */ + (bitmask << 16)); /* DQRR_DCAP::DCAP_CI */ +} + +static inline void qm_dqrr_sdqcr_set(struct qm_portal *portal, u32 sdqcr) +{ + qm_out(portal, QM_REG_DQRR_SDQCR, sdqcr); +} + +static inline void qm_dqrr_vdqcr_set(struct qm_portal *portal, u32 vdqcr) +{ + qm_out(portal, QM_REG_DQRR_VDQCR, vdqcr); +} + +static inline void qm_dqrr_set_ithresh(struct qm_portal *portal, u8 ithresh) +{ + qm_out(portal, QM_REG_DQRR_ITR, ithresh); +} + +/* --- MR API --- */ + +#define MR_SHIFT ilog2(sizeof(union qm_mr_entry)) +#define MR_CARRY (uintptr_t)(QM_MR_SIZE << MR_SHIFT) + +static union qm_mr_entry *mr_carryclear(union qm_mr_entry *p) +{ + uintptr_t addr = (uintptr_t)p; + + addr &= ~MR_CARRY; + + return (union qm_mr_entry *)addr; +} + +static inline int mr_ptr2idx(const union qm_mr_entry *e) +{ + return ((uintptr_t)e >> MR_SHIFT) & (QM_MR_SIZE - 1); +} + +static inline union qm_mr_entry *mr_inc(union qm_mr_entry *e) +{ + return mr_carryclear(e + 1); +} + +static inline int qm_mr_init(struct qm_portal *portal, enum qm_mr_pmode pmode, + enum qm_mr_cmode cmode) +{ + struct qm_mr *mr = &portal->mr; + u32 cfg; + + mr->ring = portal->addr.ce + QM_CL_MR; + mr->pi = qm_in(portal, QM_REG_MR_PI_CINH) & (QM_MR_SIZE - 1); + mr->ci = qm_in(portal, QM_REG_MR_CI_CINH) & (QM_MR_SIZE - 1); + mr->cursor = mr->ring + mr->ci; + mr->fill = dpaa_cyc_diff(QM_MR_SIZE, mr->ci, mr->pi); + mr->vbit = (qm_in(portal, QM_REG_MR_PI_CINH) & QM_MR_SIZE) + ? 
QM_MR_VERB_VBIT : 0; + mr->ithresh = qm_in(portal, QM_REG_MR_ITR); +#ifdef CONFIG_FSL_DPAA_CHECKING + mr->pmode = pmode; + mr->cmode = cmode; +#endif + cfg = (qm_in(portal, QM_REG_CFG) & 0xfffff0ff) | + ((cmode & 1) << 8); /* QCSP_CFG:MM */ + qm_out(portal, QM_REG_CFG, cfg); + return 0; +} + +static inline void qm_mr_finish(struct qm_portal *portal) +{ + struct qm_mr *mr = &portal->mr; + + if (mr->ci != mr_ptr2idx(mr->cursor)) + pr_crit("Ignoring completed MR entries\n"); +} + +static inline const union qm_mr_entry *qm_mr_current(struct qm_portal *portal) +{ + struct qm_mr *mr = &portal->mr; + + if (!mr->fill) + return NULL; + return mr->cursor; +} + +static inline int qm_mr_next(struct qm_portal *portal) +{ + struct qm_mr *mr = &portal->mr; + + DPAA_ASSERT(mr->fill); + mr->cursor = mr_inc(mr->cursor); + return --mr->fill; +} + +static inline void qm_mr_pvb_update(struct qm_portal *portal) +{ + struct qm_mr *mr = &portal->mr; + union qm_mr_entry *res = qm_cl(mr->ring, mr->pi); + + DPAA_ASSERT(mr->pmode == qm_mr_pvb); + /* + * when accessing 'verb', use __raw_readb() to ensure that compiler + * inlining doesn't try to optimise out "excess reads". + */ + if ((__raw_readb(&res->verb) & QM_MR_VERB_VBIT) == mr->vbit) { + mr->pi = (mr->pi + 1) & (QM_MR_SIZE - 1); + if (!mr->pi) + mr->vbit ^= QM_MR_VERB_VBIT; + mr->fill++; + res = mr_inc(res); + } + dpaa_invalidate_touch_ro(res); +} + +static inline void qm_mr_cci_consume(struct qm_portal *portal, u8 num) +{ + struct qm_mr *mr = &portal->mr; + + DPAA_ASSERT(mr->cmode == qm_mr_cci); + mr->ci = (mr->ci + num) & (QM_MR_SIZE - 1); + qm_out(portal, QM_REG_MR_CI_CINH, mr->ci); +} + +static inline void qm_mr_cci_consume_to_current(struct qm_portal *portal) +{ + struct qm_mr *mr = &portal->mr; + + DPAA_ASSERT(mr->cmode == qm_mr_cci); + mr->ci = mr_ptr2idx(mr->cursor); + qm_out(portal, QM_REG_MR_CI_CINH, mr->ci); +} + +static inline void qm_mr_set_ithresh(struct qm_portal *portal, u8 ithresh) +{ + qm_out(portal, QM_REG_MR_ITR, ithresh); +} + +/* --- Management command API --- */ + +static inline int qm_mc_init(struct qm_portal *portal) +{ + struct qm_mc *mc = &portal->mc; + + mc->cr = portal->addr.ce + QM_CL_CR; + mc->rr = portal->addr.ce + QM_CL_RR0; + mc->rridx = (__raw_readb(&mc->cr->_ncw_verb) & QM_MCC_VERB_VBIT) + ? 0 : 1; + mc->vbit = mc->rridx ? 
QM_MCC_VERB_VBIT : 0; +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = qman_mc_idle; +#endif + return 0; +} + +static inline void qm_mc_finish(struct qm_portal *portal) +{ +#ifdef CONFIG_FSL_DPAA_CHECKING + struct qm_mc *mc = &portal->mc; + + DPAA_ASSERT(mc->state == qman_mc_idle); + if (mc->state != qman_mc_idle) + pr_crit("Losing incomplete MC command\n"); +#endif +} + +static inline union qm_mc_command *qm_mc_start(struct qm_portal *portal) +{ + struct qm_mc *mc = &portal->mc; + + DPAA_ASSERT(mc->state == qman_mc_idle); +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = qman_mc_user; +#endif + dpaa_zero(mc->cr); + return mc->cr; +} + +static inline void qm_mc_commit(struct qm_portal *portal, u8 myverb) +{ + struct qm_mc *mc = &portal->mc; + union qm_mc_result *rr = mc->rr + mc->rridx; + + DPAA_ASSERT(mc->state == qman_mc_user); + dma_wmb(); + mc->cr->_ncw_verb = myverb | mc->vbit; + dpaa_flush(mc->cr); + dpaa_invalidate_touch_ro(rr); +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = qman_mc_hw; +#endif +} + +static inline union qm_mc_result *qm_mc_result(struct qm_portal *portal) +{ + struct qm_mc *mc = &portal->mc; + union qm_mc_result *rr = mc->rr + mc->rridx; + + DPAA_ASSERT(mc->state == qman_mc_hw); + /* + * The inactive response register's verb byte always returns zero until + * its command is submitted and completed. This includes the valid-bit, + * in case you were wondering... + */ + if (!__raw_readb(&rr->verb)) { + dpaa_invalidate_touch_ro(rr); + return NULL; + } + mc->rridx ^= 1; + mc->vbit ^= QM_MCC_VERB_VBIT; +#ifdef CONFIG_FSL_DPAA_CHECKING + mc->state = qman_mc_idle; +#endif + return rr; +} + +static inline int qm_mc_result_timeout(struct qm_portal *portal, + union qm_mc_result **mcr) +{ + int timeout = QM_MCR_TIMEOUT; + + do { + *mcr = qm_mc_result(portal); + if (*mcr) + break; + udelay(1); + } while (--timeout); + + return timeout; +} + +static inline void fq_set(struct qman_fq *fq, u32 mask) +{ + set_bits(mask, &fq->flags); +} + +static inline void fq_clear(struct qman_fq *fq, u32 mask) +{ + clear_bits(mask, &fq->flags); +} + +static inline int fq_isset(struct qman_fq *fq, u32 mask) +{ + return fq->flags & mask; +} + +static inline int fq_isclear(struct qman_fq *fq, u32 mask) +{ + return !(fq->flags & mask); +} + +struct qman_portal { + struct qm_portal p; + /* PORTAL_BITS_*** - dynamic, strictly internal */ + unsigned long bits; + /* interrupt sources processed by portal_isr(), configurable */ + unsigned long irq_sources; + u32 use_eqcr_ci_stashing; + /* only 1 volatile dequeue at a time */ + struct qman_fq *vdqcr_owned; + u32 sdqcr; + /* probing time config params for cpu-affine portals */ + const struct qm_portal_config *config; + /* needed for providing a non-NULL device to dma_map_***() */ + struct platform_device *pdev; + /* 2-element array. cgrs[0] is mask, cgrs[1] is snapshot. */ + struct qman_cgrs *cgrs; + /* linked-list of CSCN handlers. 
*/ + struct list_head cgr_cbs; + /* list lock */ + spinlock_t cgr_lock; + struct work_struct congestion_work; + struct work_struct mr_work; + char irqname[MAX_IRQNAME]; +}; + +static cpumask_t affine_mask; +static DEFINE_SPINLOCK(affine_mask_lock); +static u16 affine_channels[NR_CPUS]; +static DEFINE_PER_CPU(struct qman_portal, qman_affine_portal); +struct qman_portal *affine_portals[NR_CPUS]; + +static inline struct qman_portal *get_affine_portal(void) +{ + return &get_cpu_var(qman_affine_portal); +} + +static inline void put_affine_portal(void) +{ + put_cpu_var(qman_affine_portal); +} + +static struct workqueue_struct *qm_portal_wq; + +int qman_wq_alloc(void) +{ + qm_portal_wq = alloc_workqueue("qman_portal_wq", 0, 1); + if (!qm_portal_wq) + return -ENOMEM; + return 0; +} + +/* + * This is what everything can wait on, even if it migrates to a different cpu + * to the one whose affine portal it is waiting on. + */ +static DECLARE_WAIT_QUEUE_HEAD(affine_queue); + +static struct qman_fq **fq_table; +static u32 num_fqids; + +int qman_alloc_fq_table(u32 _num_fqids) +{ + num_fqids = _num_fqids; + + fq_table = vzalloc(num_fqids * 2 * sizeof(struct qman_fq *)); + if (!fq_table) + return -ENOMEM; + + pr_debug("Allocated fq lookup table at %p, entry count %u\n", + fq_table, num_fqids * 2); + return 0; +} + +static struct qman_fq *idx_to_fq(u32 idx) +{ + struct qman_fq *fq; + +#ifdef CONFIG_FSL_DPAA_CHECKING + if (WARN_ON(idx >= num_fqids * 2)) + return NULL; +#endif + fq = fq_table[idx]; + DPAA_ASSERT(!fq || idx == fq->idx); + + return fq; +} + +/* + * Only returns full-service fq objects, not enqueue-only + * references (QMAN_FQ_FLAG_NO_MODIFY). + */ +static struct qman_fq *fqid_to_fq(u32 fqid) +{ + return idx_to_fq(fqid * 2); +} + +static struct qman_fq *tag_to_fq(u32 tag) +{ +#if BITS_PER_LONG == 64 + return idx_to_fq(tag); +#else + return (struct qman_fq *)tag; +#endif +} + +static u32 fq_to_tag(struct qman_fq *fq) +{ +#if BITS_PER_LONG == 64 + return fq->idx; +#else + return (u32)fq; +#endif +} + +static u32 __poll_portal_slow(struct qman_portal *p, u32 is); +static inline unsigned int __poll_portal_fast(struct qman_portal *p, + unsigned int poll_limit); +static void qm_congestion_task(struct work_struct *work); +static void qm_mr_process_task(struct work_struct *work); + +static irqreturn_t portal_isr(int irq, void *ptr) +{ + struct qman_portal *p = ptr; + + u32 clear = QM_DQAVAIL_MASK | p->irq_sources; + u32 is = qm_in(&p->p, QM_REG_ISR) & p->irq_sources; + + if (unlikely(!is)) + return IRQ_NONE; + + /* DQRR-handling if it's interrupt-driven */ + if (is & QM_PIRQ_DQRI) + __poll_portal_fast(p, QMAN_POLL_LIMIT); + /* Handling of anything else that's interrupt-driven */ + clear |= __poll_portal_slow(p, is); + qm_out(&p->p, QM_REG_ISR, clear); + return IRQ_HANDLED; +} + +static int drain_mr_fqrni(struct qm_portal *p) +{ + const union qm_mr_entry *msg; +loop: + msg = qm_mr_current(p); + if (!msg) { + /* + * if MR was full and h/w had other FQRNI entries to produce, we + * need to allow it time to produce those entries once the + * existing entries are consumed. A worst-case situation + * (fully-loaded system) means h/w sequencers may have to do 3-4 + * other things before servicing the portal's MR pump, each of + * which (if slow) may take ~50 qman cycles (which is ~200 + * processor cycles). So rounding up and then multiplying this + * worst-case estimate by a factor of 10, just to be + * ultra-paranoid, goes as high as 10,000 cycles. 
NB, we consume + * one entry at a time, so h/w has an opportunity to produce new + * entries well before the ring has been fully consumed, so + * we're being *really* paranoid here. + */ + u64 now, then = jiffies; + + do { + now = jiffies; + } while ((then + 10000) > now); + msg = qm_mr_current(p); + if (!msg) + return 0; + } + if ((msg->verb & QM_MR_VERB_TYPE_MASK) != QM_MR_VERB_FQRNI) { + /* We aren't draining anything but FQRNIs */ + pr_err("Found verb 0x%x in MR\n", msg->verb); + return -1; + } + qm_mr_next(p); + qm_mr_cci_consume(p, 1); + goto loop; +} + +static int qman_create_portal(struct qman_portal *portal, + const struct qm_portal_config *c, + const struct qman_cgrs *cgrs) +{ + struct qm_portal *p; + char buf[16]; + int ret; + u32 isdr; + + p = &portal->p; + +#ifdef CONFIG_FSL_PAMU + /* PAMU is required for stashing */ + portal->use_eqcr_ci_stashing = ((qman_ip_rev >= QMAN_REV30) ? 1 : 0); +#else + portal->use_eqcr_ci_stashing = 0; +#endif + /* + * prep the low-level portal struct with the mapped addresses from the + * config, everything that follows depends on it and "config" is more + * for (de)reference + */ + p->addr.ce = c->addr_virt[DPAA_PORTAL_CE]; + p->addr.ci = c->addr_virt[DPAA_PORTAL_CI]; + /* + * If CI-stashing is used, the current defaults use a threshold of 3, + * and stash with high-than-DQRR priority. + */ + if (qm_eqcr_init(p, qm_eqcr_pvb, + portal->use_eqcr_ci_stashing ? 3 : 0, 1)) { + dev_err(c->dev, "EQCR initialisation failed\n"); + goto fail_eqcr; + } + if (qm_dqrr_init(p, c, qm_dqrr_dpush, qm_dqrr_pvb, + qm_dqrr_cdc, DQRR_MAXFILL)) { + dev_err(c->dev, "DQRR initialisation failed\n"); + goto fail_dqrr; + } + if (qm_mr_init(p, qm_mr_pvb, qm_mr_cci)) { + dev_err(c->dev, "MR initialisation failed\n"); + goto fail_mr; + } + if (qm_mc_init(p)) { + dev_err(c->dev, "MC initialisation failed\n"); + goto fail_mc; + } + /* static interrupt-gating controls */ + qm_dqrr_set_ithresh(p, QMAN_PIRQ_DQRR_ITHRESH); + qm_mr_set_ithresh(p, QMAN_PIRQ_MR_ITHRESH); + qm_out(p, QM_REG_ITPR, QMAN_PIRQ_IPERIOD); + portal->cgrs = kmalloc(2 * sizeof(*cgrs), GFP_KERNEL); + if (!portal->cgrs) + goto fail_cgrs; + /* initial snapshot is no-depletion */ + qman_cgrs_init(&portal->cgrs[1]); + if (cgrs) + portal->cgrs[0] = *cgrs; + else + /* if the given mask is NULL, assume all CGRs can be seen */ + qman_cgrs_fill(&portal->cgrs[0]); + INIT_LIST_HEAD(&portal->cgr_cbs); + spin_lock_init(&portal->cgr_lock); + INIT_WORK(&portal->congestion_work, qm_congestion_task); + INIT_WORK(&portal->mr_work, qm_mr_process_task); + portal->bits = 0; + portal->sdqcr = QM_SDQCR_SOURCE_CHANNELS | QM_SDQCR_COUNT_UPTO3 | + QM_SDQCR_DEDICATED_PRECEDENCE | QM_SDQCR_TYPE_PRIO_QOS | + QM_SDQCR_TOKEN_SET(0xab) | QM_SDQCR_CHANNELS_DEDICATED; + sprintf(buf, "qportal-%d", c->channel); + portal->pdev = platform_device_alloc(buf, -1); + if (!portal->pdev) + goto fail_devalloc; + if (dma_set_mask(&portal->pdev->dev, DMA_BIT_MASK(40))) + goto fail_devadd; + ret = platform_device_add(portal->pdev); + if (ret) + goto fail_devadd; + isdr = 0xffffffff; + qm_out(p, QM_REG_ISDR, isdr); + portal->irq_sources = 0; + qm_out(p, QM_REG_IER, 0); + qm_out(p, QM_REG_ISR, 0xffffffff); + snprintf(portal->irqname, MAX_IRQNAME, IRQNAME, c->cpu); + if (request_irq(c->irq, portal_isr, 0, portal->irqname, portal)) { + dev_err(c->dev, "request_irq() failed\n"); + goto fail_irq; + } + if (c->cpu != -1 && irq_can_set_affinity(c->irq) && + irq_set_affinity(c->irq, cpumask_of(c->cpu))) { + dev_err(c->dev, "irq_set_affinity() failed\n"); + goto 
fail_affinity; + } + + /* Need EQCR to be empty before continuing */ + isdr &= ~QM_PIRQ_EQCI; + qm_out(p, QM_REG_ISDR, isdr); + ret = qm_eqcr_get_fill(p); + if (ret) { + dev_err(c->dev, "EQCR unclean\n"); + goto fail_eqcr_empty; + } + isdr &= ~(QM_PIRQ_DQRI | QM_PIRQ_MRI); + qm_out(p, QM_REG_ISDR, isdr); + if (qm_dqrr_current(p)) { + dev_err(c->dev, "DQRR unclean\n"); + qm_dqrr_cdc_consume_n(p, 0xffff); + } + if (qm_mr_current(p) && drain_mr_fqrni(p)) { + /* special handling, drain just in case it's a few FQRNIs */ + const union qm_mr_entry *e = qm_mr_current(p); + + dev_err(c->dev, "MR dirty, VB 0x%x, rc 0x%x\n, addr 0x%x", + e->verb, e->ern.rc, e->ern.fd.addr_lo); + goto fail_dqrr_mr_empty; + } + /* Success */ + portal->config = c; + qm_out(p, QM_REG_ISDR, 0); + qm_out(p, QM_REG_IIR, 0); + /* Write a sane SDQCR */ + qm_dqrr_sdqcr_set(p, portal->sdqcr); + return 0; + +fail_dqrr_mr_empty: +fail_eqcr_empty: +fail_affinity: + free_irq(c->irq, portal); +fail_irq: + platform_device_del(portal->pdev); +fail_devadd: + platform_device_put(portal->pdev); +fail_devalloc: + kfree(portal->cgrs); +fail_cgrs: + qm_mc_finish(p); +fail_mc: + qm_mr_finish(p); +fail_mr: + qm_dqrr_finish(p); +fail_dqrr: + qm_eqcr_finish(p); +fail_eqcr: + return -EIO; +} + +struct qman_portal *qman_create_affine_portal(const struct qm_portal_config *c, + const struct qman_cgrs *cgrs) +{ + struct qman_portal *portal; + int err; + + portal = &per_cpu(qman_affine_portal, c->cpu); + err = qman_create_portal(portal, c, cgrs); + if (err) + return NULL; + + spin_lock(&affine_mask_lock); + cpumask_set_cpu(c->cpu, &affine_mask); + affine_channels[c->cpu] = c->channel; + affine_portals[c->cpu] = portal; + spin_unlock(&affine_mask_lock); + + return portal; +} + +static void qman_destroy_portal(struct qman_portal *qm) +{ + const struct qm_portal_config *pcfg; + + /* Stop dequeues on the portal */ + qm_dqrr_sdqcr_set(&qm->p, 0); + + /* + * NB we do this to "quiesce" EQCR. If we add enqueue-completions or + * something related to QM_PIRQ_EQCI, this may need fixing. + * Also, due to the prefetching model used for CI updates in the enqueue + * path, this update will only invalidate the CI cacheline *after* + * working on it, so we need to call this twice to ensure a full update + * irrespective of where the enqueue processing was at when the teardown + * began. 
+ */ + qm_eqcr_cce_update(&qm->p); + qm_eqcr_cce_update(&qm->p); + pcfg = qm->config; + + free_irq(pcfg->irq, qm); + + kfree(qm->cgrs); + qm_mc_finish(&qm->p); + qm_mr_finish(&qm->p); + qm_dqrr_finish(&qm->p); + qm_eqcr_finish(&qm->p); + + platform_device_del(qm->pdev); + platform_device_put(qm->pdev); + + qm->config = NULL; +} + +const struct qm_portal_config *qman_destroy_affine_portal(void) +{ + struct qman_portal *qm = get_affine_portal(); + const struct qm_portal_config *pcfg; + int cpu; + + pcfg = qm->config; + cpu = pcfg->cpu; + + qman_destroy_portal(qm); + + spin_lock(&affine_mask_lock); + cpumask_clear_cpu(cpu, &affine_mask); + spin_unlock(&affine_mask_lock); + put_affine_portal(); + return pcfg; +} + +/* Inline helper to reduce nesting in __poll_portal_slow() */ +static inline void fq_state_change(struct qman_portal *p, struct qman_fq *fq, + const union qm_mr_entry *msg, u8 verb) +{ + switch (verb) { + case QM_MR_VERB_FQRL: + DPAA_ASSERT(fq_isset(fq, QMAN_FQ_STATE_ORL)); + fq_clear(fq, QMAN_FQ_STATE_ORL); + break; + case QM_MR_VERB_FQRN: + DPAA_ASSERT(fq->state == qman_fq_state_parked || + fq->state == qman_fq_state_sched); + DPAA_ASSERT(fq_isset(fq, QMAN_FQ_STATE_CHANGING)); + fq_clear(fq, QMAN_FQ_STATE_CHANGING); + if (msg->fq.fqs & QM_MR_FQS_NOTEMPTY) + fq_set(fq, QMAN_FQ_STATE_NE); + if (msg->fq.fqs & QM_MR_FQS_ORLPRESENT) + fq_set(fq, QMAN_FQ_STATE_ORL); + fq->state = qman_fq_state_retired; + break; + case QM_MR_VERB_FQPN: + DPAA_ASSERT(fq->state == qman_fq_state_sched); + DPAA_ASSERT(fq_isclear(fq, QMAN_FQ_STATE_CHANGING)); + fq->state = qman_fq_state_parked; + } +} + +static void qm_congestion_task(struct work_struct *work) +{ + struct qman_portal *p = container_of(work, struct qman_portal, + congestion_work); + struct qman_cgrs rr, c; + union qm_mc_result *mcr; + struct qman_cgr *cgr; + + spin_lock(&p->cgr_lock); + qm_mc_start(&p->p); + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYCONGESTION); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + spin_unlock(&p->cgr_lock); + dev_crit(p->config->dev, "QUERYCONGESTION timeout\n"); + return; + } + /* mask out the ones I'm not interested in */ + qman_cgrs_and(&rr, (struct qman_cgrs *)&mcr->querycongestion.state, + &p->cgrs[0]); + /* check previous snapshot for delta, enter/exit congestion */ + qman_cgrs_xor(&c, &rr, &p->cgrs[1]); + /* update snapshot */ + qman_cgrs_cp(&p->cgrs[1], &rr); + /* Invoke callback */ + list_for_each_entry(cgr, &p->cgr_cbs, node) + if (cgr->cb && qman_cgrs_get(&c, cgr->cgrid)) + cgr->cb(p, cgr, qman_cgrs_get(&rr, cgr->cgrid)); + spin_unlock(&p->cgr_lock); +} + +static void qm_mr_process_task(struct work_struct *work) +{ + struct qman_portal *p = container_of(work, struct qman_portal, + mr_work); + const union qm_mr_entry *msg; + struct qman_fq *fq; + u8 verb, num = 0; + + preempt_disable(); + + while (1) { + qm_mr_pvb_update(&p->p); + msg = qm_mr_current(&p->p); + if (!msg) + break; + + verb = msg->verb & QM_MR_VERB_TYPE_MASK; + /* The message is a software ERN iff the 0x20 bit is clear */ + if (verb & 0x20) { + switch (verb) { + case QM_MR_VERB_FQRNI: + /* nada, we drop FQRNIs on the floor */ + break; + case QM_MR_VERB_FQRN: + case QM_MR_VERB_FQRL: + /* Lookup in the retirement table */ + fq = fqid_to_fq(msg->fq.fqid); + if (WARN_ON(!fq)) + break; + fq_state_change(p, fq, msg, verb); + if (fq->cb.fqs) + fq->cb.fqs(p, fq, msg); + break; + case QM_MR_VERB_FQPN: + /* Parked */ + fq = tag_to_fq(msg->fq.contextB); + fq_state_change(p, fq, msg, verb); + if (fq->cb.fqs) + fq->cb.fqs(p, fq, msg); + break; + case 
QM_MR_VERB_DC_ERN: + /* DCP ERN */ + pr_crit_once("Leaking DCP ERNs!\n"); + break; + default: + pr_crit("Invalid MR verb 0x%02x\n", verb); + } + } else { + /* Its a software ERN */ + fq = tag_to_fq(msg->ern.tag); + fq->cb.ern(p, fq, msg); + } + num++; + qm_mr_next(&p->p); + } + + qm_mr_cci_consume(&p->p, num); + preempt_enable(); +} + +static u32 __poll_portal_slow(struct qman_portal *p, u32 is) +{ + if (is & QM_PIRQ_CSCI) { + queue_work_on(smp_processor_id(), qm_portal_wq, + &p->congestion_work); + } + + if (is & QM_PIRQ_EQRI) { + qm_eqcr_cce_update(&p->p); + qm_eqcr_set_ithresh(&p->p, 0); + wake_up(&affine_queue); + } + + if (is & QM_PIRQ_MRI) { + queue_work_on(smp_processor_id(), qm_portal_wq, + &p->mr_work); + } + + return is; +} + +/* + * remove some slowish-path stuff from the "fast path" and make sure it isn't + * inlined. + */ +static noinline void clear_vdqcr(struct qman_portal *p, struct qman_fq *fq) +{ + p->vdqcr_owned = NULL; + fq_clear(fq, QMAN_FQ_STATE_VDQCR); + wake_up(&affine_queue); +} + +/* + * The only states that would conflict with other things if they ran at the + * same time on the same cpu are: + * + * (i) setting/clearing vdqcr_owned, and + * (ii) clearing the NE (Not Empty) flag. + * + * Both are safe. Because; + * + * (i) this clearing can only occur after qman_volatile_dequeue() has set the + * vdqcr_owned field (which it does before setting VDQCR), and + * qman_volatile_dequeue() blocks interrupts and preemption while this is + * done so that we can't interfere. + * (ii) the NE flag is only cleared after qman_retire_fq() has set it, and as + * with (i) that API prevents us from interfering until it's safe. + * + * The good thing is that qman_volatile_dequeue() and qman_retire_fq() run far + * less frequently (ie. per-FQ) than __poll_portal_fast() does, so the nett + * advantage comes from this function not having to "lock" anything at all. + * + * Note also that the callbacks are invoked at points which are safe against the + * above potential conflicts, but that this function itself is not re-entrant + * (this is because the function tracks one end of each FIFO in the portal and + * we do *not* want to lock that). So the consequence is that it is safe for + * user callbacks to call into any QMan API. + */ +static inline unsigned int __poll_portal_fast(struct qman_portal *p, + unsigned int poll_limit) +{ + const struct qm_dqrr_entry *dq; + struct qman_fq *fq; + enum qman_cb_dqrr_result res; + unsigned int limit = 0; + + do { + qm_dqrr_pvb_update(&p->p); + dq = qm_dqrr_current(&p->p); + if (!dq) + break; + + if (dq->stat & QM_DQRR_STAT_UNSCHEDULED) { + /* + * VDQCR: don't trust contextB as the FQ may have + * been configured for h/w consumption and we're + * draining it post-retirement. + */ + fq = p->vdqcr_owned; + /* + * We only set QMAN_FQ_STATE_NE when retiring, so we + * only need to check for clearing it when doing + * volatile dequeues. It's one less thing to check + * in the critical path (SDQCR). + */ + if (dq->stat & QM_DQRR_STAT_FQ_EMPTY) + fq_clear(fq, QMAN_FQ_STATE_NE); + /* + * This is duplicated from the SDQCR code, but we + * have stuff to do before *and* after this callback, + * and we don't want multiple if()s in the critical + * path (SDQCR). 
+ */ + res = fq->cb.dqrr(p, fq, dq); + if (res == qman_cb_dqrr_stop) + break; + /* Check for VDQCR completion */ + if (dq->stat & QM_DQRR_STAT_DQCR_EXPIRED) + clear_vdqcr(p, fq); + } else { + /* SDQCR: contextB points to the FQ */ + fq = tag_to_fq(dq->contextB); + /* Now let the callback do its stuff */ + res = fq->cb.dqrr(p, fq, dq); + /* + * The callback can request that we exit without + * consuming this entry nor advancing; + */ + if (res == qman_cb_dqrr_stop) + break; + } + /* Interpret 'dq' from a driver perspective. */ + /* + * Parking isn't possible unless HELDACTIVE was set. NB, + * FORCEELIGIBLE implies HELDACTIVE, so we only need to + * check for HELDACTIVE to cover both. + */ + DPAA_ASSERT((dq->stat & QM_DQRR_STAT_FQ_HELDACTIVE) || + (res != qman_cb_dqrr_park)); + /* just means "skip it, I'll consume it myself later on" */ + if (res != qman_cb_dqrr_defer) + qm_dqrr_cdc_consume_1ptr(&p->p, dq, + res == qman_cb_dqrr_park); + /* Move forward */ + qm_dqrr_next(&p->p); + /* + * Entry processed and consumed, increment our counter. The + * callback can request that we exit after consuming the + * entry, and we also exit if we reach our processing limit, + * so loop back only if neither of these conditions is met. + */ + } while (++limit < poll_limit && res != qman_cb_dqrr_consume_stop); + + return limit; +} + +void qman_p_irqsource_add(struct qman_portal *p, u32 bits) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + set_bits(bits & QM_PIRQ_VISIBLE, &p->irq_sources); + qm_out(&p->p, QM_REG_IER, p->irq_sources); + local_irq_restore(irqflags); +} +EXPORT_SYMBOL(qman_p_irqsource_add); + +void qman_p_irqsource_remove(struct qman_portal *p, u32 bits) +{ + unsigned long irqflags; + u32 ier; + + /* + * Our interrupt handler only processes+clears status register bits that + * are in p->irq_sources. As we're trimming that mask, if one of them + * were to assert in the status register just before we remove it from + * the enable register, there would be an interrupt-storm when we + * release the IRQ lock. So we wait for the enable register update to + * take effect in h/w (by reading it back) and then clear all other bits + * in the status register. Ie. we clear them from ISR once it's certain + * IER won't allow them to reassert. + */ + local_irq_save(irqflags); + bits &= QM_PIRQ_VISIBLE; + clear_bits(bits, &p->irq_sources); + qm_out(&p->p, QM_REG_IER, p->irq_sources); + ier = qm_in(&p->p, QM_REG_IER); + /* + * Using "~ier" (rather than "bits" or "~p->irq_sources") creates a + * data-dependency, ie. to protect against re-ordering. 
+ */ + qm_out(&p->p, QM_REG_ISR, ~ier); + local_irq_restore(irqflags); +} +EXPORT_SYMBOL(qman_p_irqsource_remove); + +const cpumask_t *qman_affine_cpus(void) +{ + return &affine_mask; +} +EXPORT_SYMBOL(qman_affine_cpus); + +u16 qman_affine_channel(int cpu) +{ + if (cpu < 0) { + struct qman_portal *portal = get_affine_portal(); + + cpu = portal->config->cpu; + put_affine_portal(); + } + WARN_ON(!cpumask_test_cpu(cpu, &affine_mask)); + return affine_channels[cpu]; +} +EXPORT_SYMBOL(qman_affine_channel); + +struct qman_portal *qman_get_affine_portal(int cpu) +{ + return affine_portals[cpu]; +} +EXPORT_SYMBOL(qman_get_affine_portal); + +int qman_p_poll_dqrr(struct qman_portal *p, unsigned int limit) +{ + return __poll_portal_fast(p, limit); +} +EXPORT_SYMBOL(qman_p_poll_dqrr); + +void qman_p_static_dequeue_add(struct qman_portal *p, u32 pools) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + pools &= p->config->pools; + p->sdqcr |= pools; + qm_dqrr_sdqcr_set(&p->p, p->sdqcr); + local_irq_restore(irqflags); +} +EXPORT_SYMBOL(qman_p_static_dequeue_add); + +/* Frame queue API */ + +static const char *mcr_result_str(u8 result) +{ + switch (result) { + case QM_MCR_RESULT_NULL: + return "QM_MCR_RESULT_NULL"; + case QM_MCR_RESULT_OK: + return "QM_MCR_RESULT_OK"; + case QM_MCR_RESULT_ERR_FQID: + return "QM_MCR_RESULT_ERR_FQID"; + case QM_MCR_RESULT_ERR_FQSTATE: + return "QM_MCR_RESULT_ERR_FQSTATE"; + case QM_MCR_RESULT_ERR_NOTEMPTY: + return "QM_MCR_RESULT_ERR_NOTEMPTY"; + case QM_MCR_RESULT_PENDING: + return "QM_MCR_RESULT_PENDING"; + case QM_MCR_RESULT_ERR_BADCOMMAND: + return "QM_MCR_RESULT_ERR_BADCOMMAND"; + } + return ""; +} + +int qman_create_fq(u32 fqid, u32 flags, struct qman_fq *fq) +{ + if (flags & QMAN_FQ_FLAG_DYNAMIC_FQID) { + int ret = qman_alloc_fqid(&fqid); + + if (ret) + return ret; + } + fq->fqid = fqid; + fq->flags = flags; + fq->state = qman_fq_state_oos; + fq->cgr_groupid = 0; + + /* A context_b of 0 is allegedly special, so don't use that fqid */ + if (fqid == 0 || fqid >= num_fqids) { + WARN(1, "bad fqid %d\n", fqid); + return -EINVAL; + } + + fq->idx = fqid * 2; + if (flags & QMAN_FQ_FLAG_NO_MODIFY) + fq->idx++; + + WARN_ON(fq_table[fq->idx]); + fq_table[fq->idx] = fq; + + return 0; +} +EXPORT_SYMBOL(qman_create_fq); + +void qman_destroy_fq(struct qman_fq *fq) +{ + /* + * We don't need to lock the FQ as it is a pre-condition that the FQ be + * quiesced. Instead, run some checks. + */ + switch (fq->state) { + case qman_fq_state_parked: + case qman_fq_state_oos: + if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) + qman_release_fqid(fq->fqid); + + DPAA_ASSERT(fq_table[fq->idx]); + fq_table[fq->idx] = NULL; + return; + default: + break; + } + DPAA_ASSERT(NULL == "qman_free_fq() on unquiesced FQ!"); +} +EXPORT_SYMBOL(qman_destroy_fq); + +u32 qman_fq_fqid(struct qman_fq *fq) +{ + return fq->fqid; +} +EXPORT_SYMBOL(qman_fq_fqid); + +int qman_init_fq(struct qman_fq *fq, u32 flags, struct qm_mcc_initfq *opts) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p; + u8 res, myverb; + int ret = 0; + + myverb = (flags & QMAN_INITFQ_FLAG_SCHED) + ? 
QM_MCC_VERB_INITFQ_SCHED : QM_MCC_VERB_INITFQ_PARKED; + + if (fq->state != qman_fq_state_oos && + fq->state != qman_fq_state_parked) + return -EINVAL; +#ifdef CONFIG_FSL_DPAA_CHECKING + if (fq_isset(fq, QMAN_FQ_FLAG_NO_MODIFY)) + return -EINVAL; +#endif + if (opts && (opts->we_mask & QM_INITFQ_WE_OAC)) { + /* And can't be set at the same time as TDTHRESH */ + if (opts->we_mask & QM_INITFQ_WE_TDTHRESH) + return -EINVAL; + } + /* Issue an INITFQ_[PARKED|SCHED] management command */ + p = get_affine_portal(); + if (fq_isset(fq, QMAN_FQ_STATE_CHANGING) || + (fq->state != qman_fq_state_oos && + fq->state != qman_fq_state_parked)) { + ret = -EBUSY; + goto out; + } + mcc = qm_mc_start(&p->p); + if (opts) + mcc->initfq = *opts; + mcc->initfq.fqid = fq->fqid; + mcc->initfq.count = 0; + /* + * If the FQ does *not* have the TO_DCPORTAL flag, contextB is set as a + * demux pointer. Otherwise, the caller-provided value is allowed to + * stand, don't overwrite it. + */ + if (fq_isclear(fq, QMAN_FQ_FLAG_TO_DCPORTAL)) { + dma_addr_t phys_fq; + + mcc->initfq.we_mask |= QM_INITFQ_WE_CONTEXTB; + mcc->initfq.fqd.context_b = fq_to_tag(fq); + /* + * and the physical address - NB, if the user wasn't trying to + * set CONTEXTA, clear the stashing settings. + */ + if (!(mcc->initfq.we_mask & QM_INITFQ_WE_CONTEXTA)) { + mcc->initfq.we_mask |= QM_INITFQ_WE_CONTEXTA; + memset(&mcc->initfq.fqd.context_a, 0, + sizeof(mcc->initfq.fqd.context_a)); + } else { + phys_fq = dma_map_single(&p->pdev->dev, fq, sizeof(*fq), + DMA_TO_DEVICE); + qm_fqd_stashing_set64(&mcc->initfq.fqd, phys_fq); + } + } + if (flags & QMAN_INITFQ_FLAG_LOCAL) { + int wq = 0; + + if (!(mcc->initfq.we_mask & QM_INITFQ_WE_DESTWQ)) { + mcc->initfq.we_mask |= QM_INITFQ_WE_DESTWQ; + wq = 4; + } + qm_fqd_set_destwq(&mcc->initfq.fqd, p->config->channel, wq); + } + qm_mc_commit(&p->p, myverb); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_err(p->config->dev, "MCR timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == myverb); + res = mcr->result; + if (res != QM_MCR_RESULT_OK) { + ret = -EIO; + goto out; + } + if (opts) { + if (opts->we_mask & QM_INITFQ_WE_FQCTRL) { + if (opts->fqd.fq_ctrl & QM_FQCTRL_CGE) + fq_set(fq, QMAN_FQ_STATE_CGR_EN); + else + fq_clear(fq, QMAN_FQ_STATE_CGR_EN); + } + if (opts->we_mask & QM_INITFQ_WE_CGID) + fq->cgr_groupid = opts->fqd.cgid; + } + fq->state = (flags & QMAN_INITFQ_FLAG_SCHED) ? 
+ qman_fq_state_sched : qman_fq_state_parked; + +out: + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_init_fq); + +int qman_schedule_fq(struct qman_fq *fq) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p; + int ret = 0; + + if (fq->state != qman_fq_state_parked) + return -EINVAL; +#ifdef CONFIG_FSL_DPAA_CHECKING + if (fq_isset(fq, QMAN_FQ_FLAG_NO_MODIFY)) + return -EINVAL; +#endif + /* Issue a ALTERFQ_SCHED management command */ + p = get_affine_portal(); + if (fq_isset(fq, QMAN_FQ_STATE_CHANGING) || + fq->state != qman_fq_state_parked) { + ret = -EBUSY; + goto out; + } + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fq->fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_SCHED); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_err(p->config->dev, "ALTER_SCHED timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_ALTER_SCHED); + if (mcr->result != QM_MCR_RESULT_OK) { + ret = -EIO; + goto out; + } + fq->state = qman_fq_state_sched; +out: + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_schedule_fq); + +int qman_retire_fq(struct qman_fq *fq, u32 *flags) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p; + int ret; + u8 res; + + if (fq->state != qman_fq_state_parked && + fq->state != qman_fq_state_sched) + return -EINVAL; +#ifdef CONFIG_FSL_DPAA_CHECKING + if (fq_isset(fq, QMAN_FQ_FLAG_NO_MODIFY)) + return -EINVAL; +#endif + p = get_affine_portal(); + if (fq_isset(fq, QMAN_FQ_STATE_CHANGING) || + fq->state == qman_fq_state_retired || + fq->state == qman_fq_state_oos) { + ret = -EBUSY; + goto out; + } + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fq->fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_RETIRE); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_crit(p->config->dev, "ALTER_RETIRE timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_ALTER_RETIRE); + res = mcr->result; + /* + * "Elegant" would be to treat OK/PENDING the same way; set CHANGING, + * and defer the flags until FQRNI or FQRN (respectively) show up. But + * "Friendly" is to process OK immediately, and not set CHANGING. We do + * friendly, otherwise the caller doesn't necessarily have a fully + * "retired" FQ on return even if the retirement was immediate. However + * this does mean some code duplication between here and + * fq_state_change(). + */ + if (res == QM_MCR_RESULT_OK) { + ret = 0; + /* Process 'fq' right away, we'll ignore FQRNI */ + if (mcr->alterfq.fqs & QM_MCR_FQS_NOTEMPTY) + fq_set(fq, QMAN_FQ_STATE_NE); + if (mcr->alterfq.fqs & QM_MCR_FQS_ORLPRESENT) + fq_set(fq, QMAN_FQ_STATE_ORL); + if (flags) + *flags = fq->flags; + fq->state = qman_fq_state_retired; + if (fq->cb.fqs) { + /* + * Another issue with supporting "immediate" retirement + * is that we're forced to drop FQRNIs, because by the + * time they're seen it may already be "too late" (the + * fq may have been OOS'd and free()'d already). But if + * the upper layer wants a callback whether it's + * immediate or not, we have to fake a "MR" entry to + * look like an FQRNI... 
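+	 * The entry built below is synthesized locally: it carries the FQRNI
+	 * verb, the FQS bits, the FQID and the FQ's tag, and is handed to the
+	 * user's fqs() callback exactly as a hardware-generated FQRNI would be.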
+ */ + union qm_mr_entry msg; + + msg.verb = QM_MR_VERB_FQRNI; + msg.fq.fqs = mcr->alterfq.fqs; + msg.fq.fqid = fq->fqid; + msg.fq.contextB = fq_to_tag(fq); + fq->cb.fqs(p, fq, &msg); + } + } else if (res == QM_MCR_RESULT_PENDING) { + ret = 1; + fq_set(fq, QMAN_FQ_STATE_CHANGING); + } else { + ret = -EIO; + } +out: + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_retire_fq); + +int qman_oos_fq(struct qman_fq *fq) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p; + int ret = 0; + + if (fq->state != qman_fq_state_retired) + return -EINVAL; +#ifdef CONFIG_FSL_DPAA_CHECKING + if (fq_isset(fq, QMAN_FQ_FLAG_NO_MODIFY)) + return -EINVAL; +#endif + p = get_affine_portal(); + if (fq_isset(fq, QMAN_FQ_STATE_BLOCKOOS) || + fq->state != qman_fq_state_retired) { + ret = -EBUSY; + goto out; + } + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fq->fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_OOS); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_ALTER_OOS); + if (mcr->result != QM_MCR_RESULT_OK) { + ret = -EIO; + goto out; + } + fq->state = qman_fq_state_oos; +out: + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_oos_fq); + +int qman_query_fq(struct qman_fq *fq, struct qm_fqd *fqd) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p = get_affine_portal(); + int ret = 0; + + mcc = qm_mc_start(&p->p); + mcc->queryfq.fqid = fq->fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYFQ); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_QUERYFQ); + if (mcr->result == QM_MCR_RESULT_OK) + *fqd = mcr->queryfq.fqd; + else + ret = -EIO; +out: + put_affine_portal(); + return ret; +} + +static int qman_query_fq_np(struct qman_fq *fq, + struct qm_mcr_queryfq_np *np) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p = get_affine_portal(); + int ret = 0; + + mcc = qm_mc_start(&p->p); + mcc->queryfq.fqid = fq->fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYFQ_NP); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_QUERYFQ_NP); + if (mcr->result == QM_MCR_RESULT_OK) + *np = mcr->queryfq_np; + else if (mcr->result == QM_MCR_RESULT_ERR_FQID) + ret = -ERANGE; + else + ret = -EIO; +out: + put_affine_portal(); + return ret; +} + +static int qman_query_cgr(struct qman_cgr *cgr, + struct qm_mcr_querycgr *cgrd) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p = get_affine_portal(); + int ret = 0; + + mcc = qm_mc_start(&p->p); + mcc->querycgr.cgid = cgr->cgrid; + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYCGR); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCC_VERB_QUERYCGR); + if (mcr->result == QM_MCR_RESULT_OK) + *cgrd = mcr->querycgr; + else { + dev_err(p->config->dev, "QUERY_CGR failed: %s\n", + mcr_result_str(mcr->result)); + ret = -EIO; + } +out: + put_affine_portal(); + return ret; +} + +int qman_query_cgr_congested(struct qman_cgr *cgr, bool *result) +{ + struct qm_mcr_querycgr query_cgr; + int err; + + err = qman_query_cgr(cgr, &query_cgr); + if (err) + return err; + + *result = !!query_cgr.cgr.cs; + return 0; +} +EXPORT_SYMBOL(qman_query_cgr_congested); + +/* internal function used as a wait_event() expression */ +static int 
set_p_vdqcr(struct qman_portal *p, struct qman_fq *fq, u32 vdqcr) +{ + unsigned long irqflags; + int ret = -EBUSY; + + local_irq_save(irqflags); + if (p->vdqcr_owned) + goto out; + if (fq_isset(fq, QMAN_FQ_STATE_VDQCR)) + goto out; + + fq_set(fq, QMAN_FQ_STATE_VDQCR); + p->vdqcr_owned = fq; + qm_dqrr_vdqcr_set(&p->p, vdqcr); + ret = 0; +out: + local_irq_restore(irqflags); + return ret; +} + +static int set_vdqcr(struct qman_portal **p, struct qman_fq *fq, u32 vdqcr) +{ + int ret; + + *p = get_affine_portal(); + ret = set_p_vdqcr(*p, fq, vdqcr); + put_affine_portal(); + return ret; +} + +static int wait_vdqcr_start(struct qman_portal **p, struct qman_fq *fq, + u32 vdqcr, u32 flags) +{ + int ret = 0; + + if (flags & QMAN_VOLATILE_FLAG_WAIT_INT) + ret = wait_event_interruptible(affine_queue, + !set_vdqcr(p, fq, vdqcr)); + else + wait_event(affine_queue, !set_vdqcr(p, fq, vdqcr)); + return ret; +} + +int qman_volatile_dequeue(struct qman_fq *fq, u32 flags, u32 vdqcr) +{ + struct qman_portal *p; + int ret; + + if (fq->state != qman_fq_state_parked && + fq->state != qman_fq_state_retired) + return -EINVAL; + if (vdqcr & QM_VDQCR_FQID_MASK) + return -EINVAL; + if (fq_isset(fq, QMAN_FQ_STATE_VDQCR)) + return -EBUSY; + vdqcr = (vdqcr & ~QM_VDQCR_FQID_MASK) | fq->fqid; + if (flags & QMAN_VOLATILE_FLAG_WAIT) + ret = wait_vdqcr_start(&p, fq, vdqcr, flags); + else + ret = set_vdqcr(&p, fq, vdqcr); + if (ret) + return ret; + /* VDQCR is set */ + if (flags & QMAN_VOLATILE_FLAG_FINISH) { + if (flags & QMAN_VOLATILE_FLAG_WAIT_INT) + /* + * NB: don't propagate any error - the caller wouldn't + * know whether the VDQCR was issued or not. A signal + * could arrive after returning anyway, so the caller + * can check signal_pending() if that's an issue. + */ + wait_event_interruptible(affine_queue, + !fq_isset(fq, QMAN_FQ_STATE_VDQCR)); + else + wait_event(affine_queue, + !fq_isset(fq, QMAN_FQ_STATE_VDQCR)); + } + return 0; +} +EXPORT_SYMBOL(qman_volatile_dequeue); + +static void update_eqcr_ci(struct qman_portal *p, u8 avail) +{ + if (avail) + qm_eqcr_cce_prefetch(&p->p); + else + qm_eqcr_cce_update(&p->p); +} + +int qman_enqueue(struct qman_fq *fq, const struct qm_fd *fd) +{ + struct qman_portal *p; + struct qm_eqcr_entry *eq; + unsigned long irqflags; + u8 avail; + + p = get_affine_portal(); + local_irq_save(irqflags); + + if (p->use_eqcr_ci_stashing) { + /* + * The stashing case is easy, only update if we need to in + * order to try and liberate ring entries. + */ + eq = qm_eqcr_start_stash(&p->p); + } else { + /* + * The non-stashing case is harder, need to prefetch ahead of + * time. 
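+	 * qm_eqcr_get_avail() reports how many EQCR entries are known to be
+	 * free; when fewer than two remain, update_eqcr_ci() either prefetches
+	 * or re-reads the cached consumer index before a new entry is started.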
+ */ + avail = qm_eqcr_get_avail(&p->p); + if (avail < 2) + update_eqcr_ci(p, avail); + eq = qm_eqcr_start_no_stash(&p->p); + } + + if (unlikely(!eq)) + goto out; + + eq->fqid = fq->fqid; + eq->tag = fq_to_tag(fq); + eq->fd = *fd; + + qm_eqcr_pvb_commit(&p->p, QM_EQCR_VERB_CMD_ENQUEUE); +out: + local_irq_restore(irqflags); + put_affine_portal(); + return 0; +} +EXPORT_SYMBOL(qman_enqueue); + +static int qm_modify_cgr(struct qman_cgr *cgr, u32 flags, + struct qm_mcc_initcgr *opts) +{ + union qm_mc_command *mcc; + union qm_mc_result *mcr; + struct qman_portal *p = get_affine_portal(); + u8 verb = QM_MCC_VERB_MODIFYCGR; + int ret = 0; + + mcc = qm_mc_start(&p->p); + if (opts) + mcc->initcgr = *opts; + mcc->initcgr.cgid = cgr->cgrid; + if (flags & QMAN_CGR_FLAG_USE_INIT) + verb = QM_MCC_VERB_INITCGR; + qm_mc_commit(&p->p, verb); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == verb); + if (mcr->result != QM_MCR_RESULT_OK) + ret = -EIO; + +out: + put_affine_portal(); + return ret; +} + +#define PORTAL_IDX(n) (n->config->channel - QM_CHANNEL_SWPORTAL0) +#define TARG_MASK(n) (BIT(31) >> PORTAL_IDX(n)) + +static u8 qman_cgr_cpus[CGR_NUM]; + +void qman_init_cgr_all(void) +{ + struct qman_cgr cgr; + int err_cnt = 0; + + for (cgr.cgrid = 0; cgr.cgrid < CGR_NUM; cgr.cgrid++) { + if (qm_modify_cgr(&cgr, QMAN_CGR_FLAG_USE_INIT, NULL)) + err_cnt++; + } + + if (err_cnt) + pr_err("Warning: %d error%s while initialising CGR h/w\n", + err_cnt, (err_cnt > 1) ? "s" : ""); +} + +int qman_create_cgr(struct qman_cgr *cgr, u32 flags, + struct qm_mcc_initcgr *opts) +{ + struct qm_mcr_querycgr cgr_state; + struct qm_mcc_initcgr local_opts = {}; + int ret; + struct qman_portal *p; + + /* + * We have to check that the provided CGRID is within the limits of the + * data-structures, for obvious reasons. However we'll let h/w take + * care of determining whether it's within the limits of what exists on + * the SoC. 
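+	 * The CGR is bound to the calling CPU's affine portal; the owning CPU
+	 * is recorded in qman_cgr_cpus[] so that qman_delete_cgr_safe() can
+	 * later run the teardown on that same CPU.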
+ */ + if (cgr->cgrid >= CGR_NUM) + return -EINVAL; + + preempt_disable(); + p = get_affine_portal(); + qman_cgr_cpus[cgr->cgrid] = smp_processor_id(); + preempt_enable(); + + cgr->chan = p->config->channel; + spin_lock(&p->cgr_lock); + + if (opts) { + ret = qman_query_cgr(cgr, &cgr_state); + if (ret) + goto out; + if (opts) + local_opts = *opts; + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) + local_opts.cgr.cscn_targ_upd_ctrl = + QM_CGR_TARG_UDP_CTRL_WRITE_BIT | PORTAL_IDX(p); + else + /* Overwrite TARG */ + local_opts.cgr.cscn_targ = cgr_state.cgr.cscn_targ | + TARG_MASK(p); + local_opts.we_mask |= QM_CGR_WE_CSCN_TARG; + + /* send init if flags indicate so */ + if (opts && (flags & QMAN_CGR_FLAG_USE_INIT)) + ret = qm_modify_cgr(cgr, QMAN_CGR_FLAG_USE_INIT, + &local_opts); + else + ret = qm_modify_cgr(cgr, 0, &local_opts); + if (ret) + goto out; + } + + list_add(&cgr->node, &p->cgr_cbs); + + /* Determine if newly added object requires its callback to be called */ + ret = qman_query_cgr(cgr, &cgr_state); + if (ret) { + /* we can't go back, so proceed and return success */ + dev_err(p->config->dev, "CGR HW state partially modified\n"); + ret = 0; + goto out; + } + if (cgr->cb && cgr_state.cgr.cscn_en && + qman_cgrs_get(&p->cgrs[1], cgr->cgrid)) + cgr->cb(p, cgr, 1); +out: + spin_unlock(&p->cgr_lock); + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_create_cgr); + +int qman_delete_cgr(struct qman_cgr *cgr) +{ + unsigned long irqflags; + struct qm_mcr_querycgr cgr_state; + struct qm_mcc_initcgr local_opts; + int ret = 0; + struct qman_cgr *i; + struct qman_portal *p = get_affine_portal(); + + if (cgr->chan != p->config->channel) { + /* attempt to delete from other portal than creator */ + dev_err(p->config->dev, "CGR not owned by current portal"); + dev_dbg(p->config->dev, " create 0x%x, delete 0x%x\n", + cgr->chan, p->config->channel); + + ret = -EINVAL; + goto put_portal; + } + memset(&local_opts, 0, sizeof(struct qm_mcc_initcgr)); + spin_lock_irqsave(&p->cgr_lock, irqflags); + list_del(&cgr->node); + /* + * If there are no other CGR objects for this CGRID in the list, + * update CSCN_TARG accordingly + */ + list_for_each_entry(i, &p->cgr_cbs, node) + if (i->cgrid == cgr->cgrid && i->cb) + goto release_lock; + ret = qman_query_cgr(cgr, &cgr_state); + if (ret) { + /* add back to the list */ + list_add(&cgr->node, &p->cgr_cbs); + goto release_lock; + } + /* Overwrite TARG */ + local_opts.we_mask = QM_CGR_WE_CSCN_TARG; + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) + local_opts.cgr.cscn_targ_upd_ctrl = PORTAL_IDX(p); + else + local_opts.cgr.cscn_targ = cgr_state.cgr.cscn_targ & + ~(TARG_MASK(p)); + ret = qm_modify_cgr(cgr, 0, &local_opts); + if (ret) + /* add back to the list */ + list_add(&cgr->node, &p->cgr_cbs); +release_lock: + spin_unlock_irqrestore(&p->cgr_lock, irqflags); +put_portal: + put_affine_portal(); + return ret; +} +EXPORT_SYMBOL(qman_delete_cgr); + +struct cgr_comp { + struct qman_cgr *cgr; + struct completion completion; +}; + +static int qman_delete_cgr_thread(void *p) +{ + struct cgr_comp *cgr_comp = (struct cgr_comp *)p; + int ret; + + ret = qman_delete_cgr(cgr_comp->cgr); + complete(&cgr_comp->completion); + + return ret; +} + +void qman_delete_cgr_safe(struct qman_cgr *cgr) +{ + struct task_struct *thread; + struct cgr_comp cgr_comp; + + preempt_disable(); + if (qman_cgr_cpus[cgr->cgrid] != smp_processor_id()) { + init_completion(&cgr_comp.completion); + cgr_comp.cgr = cgr; + thread = kthread_create(qman_delete_cgr_thread, &cgr_comp, + "cgr_del"); + + if 
(IS_ERR(thread)) + goto out; + + kthread_bind(thread, qman_cgr_cpus[cgr->cgrid]); + wake_up_process(thread); + wait_for_completion(&cgr_comp.completion); + preempt_enable(); + return; + } +out: + qman_delete_cgr(cgr); + preempt_enable(); +} +EXPORT_SYMBOL(qman_delete_cgr_safe); + +/* Cleanup FQs */ + +static int _qm_mr_consume_and_match_verb(struct qm_portal *p, int v) +{ + const union qm_mr_entry *msg; + int found = 0; + + qm_mr_pvb_update(p); + msg = qm_mr_current(p); + while (msg) { + if ((msg->verb & QM_MR_VERB_TYPE_MASK) == v) + found = 1; + qm_mr_next(p); + qm_mr_cci_consume_to_current(p); + qm_mr_pvb_update(p); + msg = qm_mr_current(p); + } + return found; +} + +static int _qm_dqrr_consume_and_match(struct qm_portal *p, u32 fqid, int s, + bool wait) +{ + const struct qm_dqrr_entry *dqrr; + int found = 0; + + do { + qm_dqrr_pvb_update(p); + dqrr = qm_dqrr_current(p); + if (!dqrr) + cpu_relax(); + } while (wait && !dqrr); + + while (dqrr) { + if (dqrr->fqid == fqid && (dqrr->stat & s)) + found = 1; + qm_dqrr_cdc_consume_1ptr(p, dqrr, 0); + qm_dqrr_pvb_update(p); + qm_dqrr_next(p); + dqrr = qm_dqrr_current(p); + } + return found; +} + +#define qm_mr_drain(p, V) \ + _qm_mr_consume_and_match_verb(p, QM_MR_VERB_##V) + +#define qm_dqrr_drain(p, f, S) \ + _qm_dqrr_consume_and_match(p, f, QM_DQRR_STAT_##S, false) + +#define qm_dqrr_drain_wait(p, f, S) \ + _qm_dqrr_consume_and_match(p, f, QM_DQRR_STAT_##S, true) + +#define qm_dqrr_drain_nomatch(p) \ + _qm_dqrr_consume_and_match(p, 0, 0, false) + +static int qman_shutdown_fq(u32 fqid) +{ + struct qman_portal *p; + struct device *dev; + union qm_mc_command *mcc; + union qm_mc_result *mcr; + int orl_empty, drain = 0, ret = 0; + u32 channel, wq, res; + u8 state; + + p = get_affine_portal(); + dev = p->config->dev; + /* Determine the state of the FQID */ + mcc = qm_mc_start(&p->p); + mcc->queryfq_np.fqid = fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYFQ_NP); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_err(dev, "QUERYFQ_NP timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_QUERYFQ_NP); + state = mcr->queryfq_np.state & QM_MCR_NP_STATE_MASK; + if (state == QM_MCR_NP_STATE_OOS) + goto out; /* Already OOS, no need to do anymore checks */ + + /* Query which channel the FQ is using */ + mcc = qm_mc_start(&p->p); + mcc->queryfq.fqid = fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_QUERYFQ); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_err(dev, "QUERYFQ timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == QM_MCR_VERB_QUERYFQ); + /* Need to store these since the MCR gets reused */ + channel = qm_fqd_get_chan(&mcr->queryfq.fqd); + wq = qm_fqd_get_wq(&mcr->queryfq.fqd); + + switch (state) { + case QM_MCR_NP_STATE_TEN_SCHED: + case QM_MCR_NP_STATE_TRU_SCHED: + case QM_MCR_NP_STATE_ACTIVE: + case QM_MCR_NP_STATE_PARKED: + orl_empty = 0; + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_RETIRE); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + dev_err(dev, "QUERYFQ_NP timeout\n"); + ret = -ETIMEDOUT; + goto out; + } + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == + QM_MCR_VERB_ALTER_RETIRE); + res = mcr->result; /* Make a copy as we reuse MCR below */ + + if (res == QM_MCR_RESULT_PENDING) { + /* + * Need to wait for the FQRN in the message ring, which + * will only occur once the FQ has been drained. 
In + * order for the FQ to drain the portal needs to be set + * to dequeue from the channel the FQ is scheduled on + */ + int found_fqrn = 0; + u16 dequeue_wq = 0; + + /* Flag that we need to drain FQ */ + drain = 1; + + if (channel >= qm_channel_pool1 && + channel < qm_channel_pool1 + 15) { + /* Pool channel, enable the bit in the portal */ + dequeue_wq = (channel - + qm_channel_pool1 + 1)<<4 | wq; + } else if (channel < qm_channel_pool1) { + /* Dedicated channel */ + dequeue_wq = wq; + } else { + dev_err(dev, "Can't recover FQ 0x%x, ch: 0x%x", + fqid, channel); + ret = -EBUSY; + goto out; + } + /* Set the sdqcr to drain this channel */ + if (channel < qm_channel_pool1) + qm_dqrr_sdqcr_set(&p->p, + QM_SDQCR_TYPE_ACTIVE | + QM_SDQCR_CHANNELS_DEDICATED); + else + qm_dqrr_sdqcr_set(&p->p, + QM_SDQCR_TYPE_ACTIVE | + QM_SDQCR_CHANNELS_POOL_CONV + (channel)); + do { + /* Keep draining DQRR while checking the MR*/ + qm_dqrr_drain_nomatch(&p->p); + /* Process message ring too */ + found_fqrn = qm_mr_drain(&p->p, FQRN); + cpu_relax(); + } while (!found_fqrn); + + } + if (res != QM_MCR_RESULT_OK && + res != QM_MCR_RESULT_PENDING) { + dev_err(dev, "retire_fq failed: FQ 0x%x, res=0x%x\n", + fqid, res); + ret = -EIO; + goto out; + } + if (!(mcr->alterfq.fqs & QM_MCR_FQS_ORLPRESENT)) { + /* + * ORL had no entries, no need to wait until the + * ERNs come in + */ + orl_empty = 1; + } + /* + * Retirement succeeded, check to see if FQ needs + * to be drained + */ + if (drain || mcr->alterfq.fqs & QM_MCR_FQS_NOTEMPTY) { + /* FQ is Not Empty, drain using volatile DQ commands */ + do { + u32 vdqcr = fqid | QM_VDQCR_NUMFRAMES_SET(3); + + qm_dqrr_vdqcr_set(&p->p, vdqcr); + /* + * Wait for a dequeue and process the dequeues, + * making sure to empty the ring completely + */ + } while (qm_dqrr_drain_wait(&p->p, fqid, FQ_EMPTY)); + } + qm_dqrr_sdqcr_set(&p->p, 0); + + while (!orl_empty) { + /* Wait for the ORL to have been completely drained */ + orl_empty = qm_mr_drain(&p->p, FQRL); + cpu_relax(); + } + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_OOS); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == + QM_MCR_VERB_ALTER_OOS); + if (mcr->result != QM_MCR_RESULT_OK) { + dev_err(dev, "OOS after drain fail: FQ 0x%x (0x%x)\n", + fqid, mcr->result); + ret = -EIO; + goto out; + } + break; + + case QM_MCR_NP_STATE_RETIRED: + /* Send OOS Command */ + mcc = qm_mc_start(&p->p); + mcc->alterfq.fqid = fqid; + qm_mc_commit(&p->p, QM_MCC_VERB_ALTER_OOS); + if (!qm_mc_result_timeout(&p->p, &mcr)) { + ret = -ETIMEDOUT; + goto out; + } + + DPAA_ASSERT((mcr->verb & QM_MCR_VERB_MASK) == + QM_MCR_VERB_ALTER_OOS); + if (mcr->result) { + dev_err(dev, "OOS fail: FQ 0x%x (0x%x)\n", + fqid, mcr->result); + ret = -EIO; + goto out; + } + break; + + case QM_MCR_NP_STATE_OOS: + /* Done */ + break; + + default: + ret = -EIO; + } + +out: + put_affine_portal(); + return ret; +} + +const struct qm_portal_config *qman_get_qm_portal_config( + struct qman_portal *portal) +{ + return portal->config; +} + +struct gen_pool *qm_fqalloc; /* FQID allocator */ +struct gen_pool *qm_qpalloc; /* pool-channel allocator */ +struct gen_pool *qm_cgralloc; /* CGR ID allocator */ + +static int qman_alloc_range(struct gen_pool *p, u32 *result, u32 cnt) +{ + unsigned long addr; + + addr = gen_pool_alloc(p, cnt); + if (!addr) + return -ENOMEM; + + *result = addr & ~DPAA_GENALLOC_OFF; + + return 0; +} + +int qman_alloc_fqid_range(u32 
*result, u32 count) +{ + return qman_alloc_range(qm_fqalloc, result, count); +} +EXPORT_SYMBOL(qman_alloc_fqid_range); + +int qman_alloc_pool_range(u32 *result, u32 count) +{ + return qman_alloc_range(qm_qpalloc, result, count); +} +EXPORT_SYMBOL(qman_alloc_pool_range); + +int qman_alloc_cgrid_range(u32 *result, u32 count) +{ + return qman_alloc_range(qm_cgralloc, result, count); +} +EXPORT_SYMBOL(qman_alloc_cgrid_range); + +int qman_release_fqid(u32 fqid) +{ + int ret = qman_shutdown_fq(fqid); + + if (ret) { + pr_debug("FQID %d leaked\n", fqid); + return ret; + } + + gen_pool_free(qm_fqalloc, fqid | DPAA_GENALLOC_OFF, 1); + return 0; +} +EXPORT_SYMBOL(qman_release_fqid); + +static int qpool_cleanup(u32 qp) +{ + /* + * We query all FQDs starting from + * FQID 1 until we get an "invalid FQID" error, looking for non-OOS FQDs + * whose destination channel is the pool-channel being released. + * When a non-OOS FQD is found we attempt to clean it up + */ + struct qman_fq fq = { + .fqid = QM_FQID_RANGE_START + }; + int err; + + do { + struct qm_mcr_queryfq_np np; + + err = qman_query_fq_np(&fq, &np); + if (err) + /* FQID range exceeded, found no problems */ + return 0; + if ((np.state & QM_MCR_NP_STATE_MASK) != QM_MCR_NP_STATE_OOS) { + struct qm_fqd fqd; + + err = qman_query_fq(&fq, &fqd); + if (WARN_ON(err)) + return 0; + if (qm_fqd_get_chan(&fqd) == qp) { + /* The channel is the FQ's target, clean it */ + err = qman_shutdown_fq(fq.fqid); + if (err) + /* + * Couldn't shut down the FQ + * so the pool must be leaked + */ + return err; + } + } + /* Move to the next FQID */ + fq.fqid++; + } while (1); +} + +int qman_release_pool(u32 qp) +{ + int ret; + + ret = qpool_cleanup(qp); + if (ret) { + pr_debug("CHID %d leaked\n", qp); + return ret; + } + + gen_pool_free(qm_qpalloc, qp | DPAA_GENALLOC_OFF, 1); + return 0; +} +EXPORT_SYMBOL(qman_release_pool); + +static int cgr_cleanup(u32 cgrid) +{ + /* + * query all FQDs starting from FQID 1 until we get an "invalid FQID" + * error, looking for non-OOS FQDs whose CGR is the CGR being released + */ + struct qman_fq fq = { + .fqid = 1 + }; + int err; + + do { + struct qm_mcr_queryfq_np np; + + err = qman_query_fq_np(&fq, &np); + if (err) + /* FQID range exceeded, found no problems */ + return 0; + if ((np.state & QM_MCR_NP_STATE_MASK) != QM_MCR_NP_STATE_OOS) { + struct qm_fqd fqd; + + err = qman_query_fq(&fq, &fqd); + if (WARN_ON(err)) + return 0; + if ((fqd.fq_ctrl & QM_FQCTRL_CGE) && + fqd.cgid == cgrid) { + pr_err("CRGID 0x%x is being used by FQID 0x%x, CGR will be leaked\n", + cgrid, fq.fqid); + return -EIO; + } + } + /* Move to the next FQID */ + fq.fqid++; + } while (1); +} + +int qman_release_cgrid(u32 cgrid) +{ + int ret; + + ret = cgr_cleanup(cgrid); + if (ret) { + pr_debug("CGRID %d leaked\n", cgrid); + return ret; + } + + gen_pool_free(qm_cgralloc, cgrid | DPAA_GENALLOC_OFF, 1); + return 0; +} +EXPORT_SYMBOL(qman_release_cgrid); diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c b/drivers/soc/fsl/qbman/qman_ccsr.c new file mode 100644 index 000000000000..0cace9e0077e --- /dev/null +++ b/drivers/soc/fsl/qbman/qman_ccsr.c @@ -0,0 +1,808 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "qman_priv.h" + +u16 qman_ip_rev; +EXPORT_SYMBOL(qman_ip_rev); +u16 qm_channel_pool1 = QMAN_CHANNEL_POOL1; +EXPORT_SYMBOL(qm_channel_pool1); + +/* Register offsets */ +#define REG_QCSP_LIO_CFG(n) (0x0000 + ((n) * 0x10)) +#define REG_QCSP_IO_CFG(n) (0x0004 + ((n) * 0x10)) +#define REG_QCSP_DD_CFG(n) (0x000c + ((n) * 0x10)) +#define REG_DD_CFG 0x0200 +#define REG_DCP_CFG(n) (0x0300 + ((n) * 0x10)) +#define REG_DCP_DD_CFG(n) (0x0304 + ((n) * 0x10)) +#define REG_DCP_DLM_AVG(n) (0x030c + ((n) * 0x10)) +#define REG_PFDR_FPC 0x0400 +#define REG_PFDR_FP_HEAD 0x0404 +#define REG_PFDR_FP_TAIL 0x0408 +#define REG_PFDR_FP_LWIT 0x0410 +#define REG_PFDR_CFG 0x0414 +#define REG_SFDR_CFG 0x0500 +#define REG_SFDR_IN_USE 0x0504 +#define REG_WQ_CS_CFG(n) (0x0600 + ((n) * 0x04)) +#define REG_WQ_DEF_ENC_WQID 0x0630 +#define REG_WQ_SC_DD_CFG(n) (0x640 + ((n) * 0x04)) +#define REG_WQ_PC_DD_CFG(n) (0x680 + ((n) * 0x04)) +#define REG_WQ_DC0_DD_CFG(n) (0x6c0 + ((n) * 0x04)) +#define REG_WQ_DC1_DD_CFG(n) (0x700 + ((n) * 0x04)) +#define REG_WQ_DCn_DD_CFG(n) (0x6c0 + ((n) * 0x40)) /* n=2,3 */ +#define REG_CM_CFG 0x0800 +#define REG_ECSR 0x0a00 +#define REG_ECIR 0x0a04 +#define REG_EADR 0x0a08 +#define REG_ECIR2 0x0a0c +#define REG_EDATA(n) (0x0a10 + ((n) * 0x04)) +#define REG_SBEC(n) (0x0a80 + ((n) * 0x04)) +#define REG_MCR 0x0b00 +#define REG_MCP(n) (0x0b04 + ((n) * 0x04)) +#define REG_MISC_CFG 0x0be0 +#define REG_HID_CFG 0x0bf0 +#define REG_IDLE_STAT 0x0bf4 +#define REG_IP_REV_1 0x0bf8 +#define REG_IP_REV_2 0x0bfc +#define REG_FQD_BARE 0x0c00 +#define REG_PFDR_BARE 0x0c20 +#define REG_offset_BAR 0x0004 /* relative to REG_[FQD|PFDR]_BARE */ +#define REG_offset_AR 0x0010 /* relative to REG_[FQD|PFDR]_BARE */ +#define REG_QCSP_BARE 0x0c80 +#define REG_QCSP_BAR 0x0c84 +#define REG_CI_SCHED_CFG 0x0d00 +#define REG_SRCIDR 0x0d04 +#define REG_LIODNR 0x0d08 +#define REG_CI_RLM_AVG 0x0d14 +#define REG_ERR_ISR 0x0e00 +#define REG_ERR_IER 0x0e04 +#define REG_REV3_QCSP_LIO_CFG(n) (0x1000 + ((n) * 0x10)) +#define REG_REV3_QCSP_IO_CFG(n) (0x1004 + ((n) * 0x10)) +#define REG_REV3_QCSP_DD_CFG(n) (0x100c 
+ ((n) * 0x10)) + +/* Assists for QMAN_MCR */ +#define MCR_INIT_PFDR 0x01000000 +#define MCR_get_rslt(v) (u8)((v) >> 24) +#define MCR_rslt_idle(r) (!(r) || ((r) >= 0xf0)) +#define MCR_rslt_ok(r) ((r) == 0xf0) +#define MCR_rslt_eaccess(r) ((r) == 0xf8) +#define MCR_rslt_inval(r) ((r) == 0xff) + +/* + * Corenet initiator settings. Stash request queues are 4-deep to match cores + * ability to snarf. Stash priority is 3, other priorities are 2. + */ +#define QM_CI_SCHED_CFG_SRCCIV 4 +#define QM_CI_SCHED_CFG_SRQ_W 3 +#define QM_CI_SCHED_CFG_RW_W 2 +#define QM_CI_SCHED_CFG_BMAN_W 2 +/* write SRCCIV enable */ +#define QM_CI_SCHED_CFG_SRCCIV_EN BIT(31) + +/* Follows WQ_CS_CFG0-5 */ +enum qm_wq_class { + qm_wq_portal = 0, + qm_wq_pool = 1, + qm_wq_fman0 = 2, + qm_wq_fman1 = 3, + qm_wq_caam = 4, + qm_wq_pme = 5, + qm_wq_first = qm_wq_portal, + qm_wq_last = qm_wq_pme +}; + +/* Follows FQD_[BARE|BAR|AR] and PFDR_[BARE|BAR|AR] */ +enum qm_memory { + qm_memory_fqd, + qm_memory_pfdr +}; + +/* Used by all error interrupt registers except 'inhibit' */ +#define QM_EIRQ_CIDE 0x20000000 /* Corenet Initiator Data Error */ +#define QM_EIRQ_CTDE 0x10000000 /* Corenet Target Data Error */ +#define QM_EIRQ_CITT 0x08000000 /* Corenet Invalid Target Transaction */ +#define QM_EIRQ_PLWI 0x04000000 /* PFDR Low Watermark */ +#define QM_EIRQ_MBEI 0x02000000 /* Multi-bit ECC Error */ +#define QM_EIRQ_SBEI 0x01000000 /* Single-bit ECC Error */ +#define QM_EIRQ_PEBI 0x00800000 /* PFDR Enqueues Blocked Interrupt */ +#define QM_EIRQ_IFSI 0x00020000 /* Invalid FQ Flow Control State */ +#define QM_EIRQ_ICVI 0x00010000 /* Invalid Command Verb */ +#define QM_EIRQ_IDDI 0x00000800 /* Invalid Dequeue (Direct-connect) */ +#define QM_EIRQ_IDFI 0x00000400 /* Invalid Dequeue FQ */ +#define QM_EIRQ_IDSI 0x00000200 /* Invalid Dequeue Source */ +#define QM_EIRQ_IDQI 0x00000100 /* Invalid Dequeue Queue */ +#define QM_EIRQ_IECE 0x00000010 /* Invalid Enqueue Configuration */ +#define QM_EIRQ_IEOI 0x00000008 /* Invalid Enqueue Overflow */ +#define QM_EIRQ_IESI 0x00000004 /* Invalid Enqueue State */ +#define QM_EIRQ_IECI 0x00000002 /* Invalid Enqueue Channel */ +#define QM_EIRQ_IEQI 0x00000001 /* Invalid Enqueue Queue */ + +/* QMAN_ECIR valid error bit */ +#define PORTAL_ECSR_ERR (QM_EIRQ_IEQI | QM_EIRQ_IESI | QM_EIRQ_IEOI | \ + QM_EIRQ_IDQI | QM_EIRQ_IDSI | QM_EIRQ_IDFI | \ + QM_EIRQ_IDDI | QM_EIRQ_ICVI | QM_EIRQ_IFSI) +#define FQID_ECSR_ERR (QM_EIRQ_IEQI | QM_EIRQ_IECI | QM_EIRQ_IESI | \ + QM_EIRQ_IEOI | QM_EIRQ_IDQI | QM_EIRQ_IDFI | \ + QM_EIRQ_IFSI) + +struct qm_ecir { + u32 info; /* res[30-31], ptyp[29], pnum[24-28], fqid[0-23] */ +}; + +static bool qm_ecir_is_dcp(const struct qm_ecir *p) +{ + return p->info & BIT(29); +} + +static int qm_ecir_get_pnum(const struct qm_ecir *p) +{ + return (p->info >> 24) & 0x1f; +} + +static int qm_ecir_get_fqid(const struct qm_ecir *p) +{ + return p->info & (BIT(24) - 1); +} + +struct qm_ecir2 { + u32 info; /* ptyp[31], res[10-30], pnum[0-9] */ +}; + +static bool qm_ecir2_is_dcp(const struct qm_ecir2 *p) +{ + return p->info & BIT(31); +} + +static int qm_ecir2_get_pnum(const struct qm_ecir2 *p) +{ + return p->info & (BIT(10) - 1); +} + +struct qm_eadr { + u32 info; /* memid[24-27], eadr[0-11] */ + /* v3: memid[24-28], eadr[0-15] */ +}; + +static int qm_eadr_get_memid(const struct qm_eadr *p) +{ + return (p->info >> 24) & 0xf; +} + +static int qm_eadr_get_eadr(const struct qm_eadr *p) +{ + return p->info & (BIT(12) - 1); +} + +static int qm_eadr_v3_get_memid(const struct qm_eadr *p) +{ + return (p->info 
>> 24) & 0x1f; +} + +static int qm_eadr_v3_get_eadr(const struct qm_eadr *p) +{ + return p->info & (BIT(16) - 1); +} + +struct qman_hwerr_txt { + u32 mask; + const char *txt; +}; + + +static const struct qman_hwerr_txt qman_hwerr_txts[] = { + { QM_EIRQ_CIDE, "Corenet Initiator Data Error" }, + { QM_EIRQ_CTDE, "Corenet Target Data Error" }, + { QM_EIRQ_CITT, "Corenet Invalid Target Transaction" }, + { QM_EIRQ_PLWI, "PFDR Low Watermark" }, + { QM_EIRQ_MBEI, "Multi-bit ECC Error" }, + { QM_EIRQ_SBEI, "Single-bit ECC Error" }, + { QM_EIRQ_PEBI, "PFDR Enqueues Blocked Interrupt" }, + { QM_EIRQ_ICVI, "Invalid Command Verb" }, + { QM_EIRQ_IFSI, "Invalid Flow Control State" }, + { QM_EIRQ_IDDI, "Invalid Dequeue (Direct-connect)" }, + { QM_EIRQ_IDFI, "Invalid Dequeue FQ" }, + { QM_EIRQ_IDSI, "Invalid Dequeue Source" }, + { QM_EIRQ_IDQI, "Invalid Dequeue Queue" }, + { QM_EIRQ_IECE, "Invalid Enqueue Configuration" }, + { QM_EIRQ_IEOI, "Invalid Enqueue Overflow" }, + { QM_EIRQ_IESI, "Invalid Enqueue State" }, + { QM_EIRQ_IECI, "Invalid Enqueue Channel" }, + { QM_EIRQ_IEQI, "Invalid Enqueue Queue" }, +}; + +struct qman_error_info_mdata { + u16 addr_mask; + u16 bits; + const char *txt; +}; + +static const struct qman_error_info_mdata error_mdata[] = { + { 0x01FF, 24, "FQD cache tag memory 0" }, + { 0x01FF, 24, "FQD cache tag memory 1" }, + { 0x01FF, 24, "FQD cache tag memory 2" }, + { 0x01FF, 24, "FQD cache tag memory 3" }, + { 0x0FFF, 512, "FQD cache memory" }, + { 0x07FF, 128, "SFDR memory" }, + { 0x01FF, 72, "WQ context memory" }, + { 0x00FF, 240, "CGR memory" }, + { 0x00FF, 302, "Internal Order Restoration List memory" }, + { 0x01FF, 256, "SW portal ring memory" }, +}; + +#define QMAN_ERRS_TO_DISABLE (QM_EIRQ_PLWI | QM_EIRQ_PEBI) + +/* + * TODO: unimplemented registers + * + * Keeping a list here of QMan registers I have not yet covered; + * QCSP_DD_IHRSR, QCSP_DD_IHRFR, QCSP_DD_HASR, + * DCP_DD_IHRSR, DCP_DD_IHRFR, DCP_DD_HASR, CM_CFG, + * QMAN_EECC, QMAN_SBET, QMAN_EINJ, QMAN_SBEC0-12 + */ + +/* Pointer to the start of the QMan's CCSR space */ +static u32 __iomem *qm_ccsr_start; +/* A SDQCR mask comprising all the available/visible pool channels */ +static u32 qm_pools_sdqcr; + +static inline u32 qm_ccsr_in(u32 offset) +{ + return ioread32be(qm_ccsr_start + offset/4); +} + +static inline void qm_ccsr_out(u32 offset, u32 val) +{ + iowrite32be(val, qm_ccsr_start + offset/4); +} + +u32 qm_get_pools_sdqcr(void) +{ + return qm_pools_sdqcr; +} + +enum qm_dc_portal { + qm_dc_portal_fman0 = 0, + qm_dc_portal_fman1 = 1 +}; + +static void qm_set_dc(enum qm_dc_portal portal, int ed, u8 sernd) +{ + DPAA_ASSERT(!ed || portal == qm_dc_portal_fman0 || + portal == qm_dc_portal_fman1); + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) + qm_ccsr_out(REG_DCP_CFG(portal), + (ed ? 0x1000 : 0) | (sernd & 0x3ff)); + else + qm_ccsr_out(REG_DCP_CFG(portal), + (ed ? 
0x100 : 0) | (sernd & 0x1f)); +} + +static void qm_set_wq_scheduling(enum qm_wq_class wq_class, + u8 cs_elev, u8 csw2, u8 csw3, u8 csw4, + u8 csw5, u8 csw6, u8 csw7) +{ + qm_ccsr_out(REG_WQ_CS_CFG(wq_class), ((cs_elev & 0xff) << 24) | + ((csw2 & 0x7) << 20) | ((csw3 & 0x7) << 16) | + ((csw4 & 0x7) << 12) | ((csw5 & 0x7) << 8) | + ((csw6 & 0x7) << 4) | (csw7 & 0x7)); +} + +static void qm_set_hid(void) +{ + qm_ccsr_out(REG_HID_CFG, 0); +} + +static void qm_set_corenet_initiator(void) +{ + qm_ccsr_out(REG_CI_SCHED_CFG, QM_CI_SCHED_CFG_SRCCIV_EN | + (QM_CI_SCHED_CFG_SRCCIV << 24) | + (QM_CI_SCHED_CFG_SRQ_W << 8) | + (QM_CI_SCHED_CFG_RW_W << 4) | + QM_CI_SCHED_CFG_BMAN_W); +} + +static void qm_get_version(u16 *id, u8 *major, u8 *minor) +{ + u32 v = qm_ccsr_in(REG_IP_REV_1); + *id = (v >> 16); + *major = (v >> 8) & 0xff; + *minor = v & 0xff; +} + +#define PFDR_AR_EN BIT(31) +static void qm_set_memory(enum qm_memory memory, u64 ba, u32 size) +{ + u32 offset = (memory == qm_memory_fqd) ? REG_FQD_BARE : REG_PFDR_BARE; + u32 exp = ilog2(size); + + /* choke if size isn't within range */ + DPAA_ASSERT((size >= 4096) && (size <= 1024*1024*1024) && + is_power_of_2(size)); + /* choke if 'ba' has lower-alignment than 'size' */ + DPAA_ASSERT(!(ba & (size - 1))); + qm_ccsr_out(offset, upper_32_bits(ba)); + qm_ccsr_out(offset + REG_offset_BAR, lower_32_bits(ba)); + qm_ccsr_out(offset + REG_offset_AR, PFDR_AR_EN | (exp - 1)); +} + +static void qm_set_pfdr_threshold(u32 th, u8 k) +{ + qm_ccsr_out(REG_PFDR_FP_LWIT, th & 0xffffff); + qm_ccsr_out(REG_PFDR_CFG, k); +} + +static void qm_set_sfdr_threshold(u16 th) +{ + qm_ccsr_out(REG_SFDR_CFG, th & 0x3ff); +} + +static int qm_init_pfdr(struct device *dev, u32 pfdr_start, u32 num) +{ + u8 rslt = MCR_get_rslt(qm_ccsr_in(REG_MCR)); + + DPAA_ASSERT(pfdr_start && !(pfdr_start & 7) && !(num & 7) && num); + /* Make sure the command interface is 'idle' */ + if (!MCR_rslt_idle(rslt)) { + dev_crit(dev, "QMAN_MCR isn't idle"); + WARN_ON(1); + } + + /* Write the MCR command params then the verb */ + qm_ccsr_out(REG_MCP(0), pfdr_start); + /* + * TODO: remove this - it's a workaround for a model bug that is + * corrected in more recent versions. We use the workaround until + * everyone has upgraded. + */ + qm_ccsr_out(REG_MCP(1), pfdr_start + num - 16); + dma_wmb(); + qm_ccsr_out(REG_MCR, MCR_INIT_PFDR); + /* Poll for the result */ + do { + rslt = MCR_get_rslt(qm_ccsr_in(REG_MCR)); + } while (!MCR_rslt_idle(rslt)); + if (MCR_rslt_ok(rslt)) + return 0; + if (MCR_rslt_eaccess(rslt)) + return -EACCES; + if (MCR_rslt_inval(rslt)) + return -EINVAL; + dev_crit(dev, "Unexpected result from MCR_INIT_PFDR: %02x\n", rslt); + return -ENODEV; +} + +/* + * Ideally we would use the DMA API to turn rmem->base into a DMA address + * (especially if iommu translations ever get involved). Unfortunately, the + * DMA API currently does not allow mapping anything that is not backed with + * a struct page. 
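+ * Instead, the base and size of the "fsl,qman-fqd" and "fsl,qman-pfdr"
+ * reserved-memory regions are captured below via RESERVEDMEM_OF_DECLARE()
+ * and later programmed into the FQD/PFDR base-address registers as raw
+ * physical addresses by qman_init_ccsr().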
+ */ +static dma_addr_t fqd_a, pfdr_a; +static size_t fqd_sz, pfdr_sz; + +static int qman_fqd(struct reserved_mem *rmem) +{ + fqd_a = rmem->base; + fqd_sz = rmem->size; + + WARN_ON(!(fqd_a && fqd_sz)); + + return 0; +} +RESERVEDMEM_OF_DECLARE(qman_fqd, "fsl,qman-fqd", qman_fqd); + +static int qman_pfdr(struct reserved_mem *rmem) +{ + pfdr_a = rmem->base; + pfdr_sz = rmem->size; + + WARN_ON(!(pfdr_a && pfdr_sz)); + + return 0; +} +RESERVEDMEM_OF_DECLARE(qman_pfdr, "fsl,qman-pfdr", qman_pfdr); + +static unsigned int qm_get_fqid_maxcnt(void) +{ + return fqd_sz / 64; +} + +/* + * Flush this memory range from data cache so that QMAN originated + * transactions for this memory region could be marked non-coherent. + */ +static int zero_priv_mem(struct device *dev, struct device_node *node, + phys_addr_t addr, size_t sz) +{ + /* map as cacheable, non-guarded */ + void __iomem *tmpp = ioremap_prot(addr, sz, 0); + + memset_io(tmpp, 0, sz); + flush_dcache_range((unsigned long)tmpp, + (unsigned long)tmpp + sz); + iounmap(tmpp); + + return 0; +} + +static void log_edata_bits(struct device *dev, u32 bit_count) +{ + u32 i, j, mask = 0xffffffff; + + dev_warn(dev, "ErrInt, EDATA:\n"); + i = bit_count / 32; + if (bit_count % 32) { + i++; + mask = ~(mask << bit_count % 32); + } + j = 16 - i; + dev_warn(dev, " 0x%08x\n", qm_ccsr_in(REG_EDATA(j)) & mask); + j++; + for (; j < 16; j++) + dev_warn(dev, " 0x%08x\n", qm_ccsr_in(REG_EDATA(j))); +} + +static void log_additional_error_info(struct device *dev, u32 isr_val, + u32 ecsr_val) +{ + struct qm_ecir ecir_val; + struct qm_eadr eadr_val; + int memid; + + ecir_val.info = qm_ccsr_in(REG_ECIR); + /* Is portal info valid */ + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) { + struct qm_ecir2 ecir2_val; + + ecir2_val.info = qm_ccsr_in(REG_ECIR2); + if (ecsr_val & PORTAL_ECSR_ERR) { + dev_warn(dev, "ErrInt: %s id %d\n", + qm_ecir2_is_dcp(&ecir2_val) ? "DCP" : "SWP", + qm_ecir2_get_pnum(&ecir2_val)); + } + if (ecsr_val & (FQID_ECSR_ERR | QM_EIRQ_IECE)) + dev_warn(dev, "ErrInt: ecir.fqid 0x%x\n", + qm_ecir_get_fqid(&ecir_val)); + + if (ecsr_val & (QM_EIRQ_SBEI|QM_EIRQ_MBEI)) { + eadr_val.info = qm_ccsr_in(REG_EADR); + memid = qm_eadr_v3_get_memid(&eadr_val); + dev_warn(dev, "ErrInt: EADR Memory: %s, 0x%x\n", + error_mdata[memid].txt, + error_mdata[memid].addr_mask + & qm_eadr_v3_get_eadr(&eadr_val)); + log_edata_bits(dev, error_mdata[memid].bits); + } + } else { + if (ecsr_val & PORTAL_ECSR_ERR) { + dev_warn(dev, "ErrInt: %s id %d\n", + qm_ecir_is_dcp(&ecir_val) ? 
"DCP" : "SWP", + qm_ecir_get_pnum(&ecir_val)); + } + if (ecsr_val & FQID_ECSR_ERR) + dev_warn(dev, "ErrInt: ecir.fqid 0x%x\n", + qm_ecir_get_fqid(&ecir_val)); + + if (ecsr_val & (QM_EIRQ_SBEI|QM_EIRQ_MBEI)) { + eadr_val.info = qm_ccsr_in(REG_EADR); + memid = qm_eadr_get_memid(&eadr_val); + dev_warn(dev, "ErrInt: EADR Memory: %s, 0x%x\n", + error_mdata[memid].txt, + error_mdata[memid].addr_mask + & qm_eadr_get_eadr(&eadr_val)); + log_edata_bits(dev, error_mdata[memid].bits); + } + } +} + +static irqreturn_t qman_isr(int irq, void *ptr) +{ + u32 isr_val, ier_val, ecsr_val, isr_mask, i; + struct device *dev = ptr; + + ier_val = qm_ccsr_in(REG_ERR_IER); + isr_val = qm_ccsr_in(REG_ERR_ISR); + ecsr_val = qm_ccsr_in(REG_ECSR); + isr_mask = isr_val & ier_val; + + if (!isr_mask) + return IRQ_NONE; + + for (i = 0; i < ARRAY_SIZE(qman_hwerr_txts); i++) { + if (qman_hwerr_txts[i].mask & isr_mask) { + dev_err_ratelimited(dev, "ErrInt: %s\n", + qman_hwerr_txts[i].txt); + if (qman_hwerr_txts[i].mask & ecsr_val) { + log_additional_error_info(dev, isr_mask, + ecsr_val); + /* Re-arm error capture registers */ + qm_ccsr_out(REG_ECSR, ecsr_val); + } + if (qman_hwerr_txts[i].mask & QMAN_ERRS_TO_DISABLE) { + dev_dbg(dev, "Disabling error 0x%x\n", + qman_hwerr_txts[i].mask); + ier_val &= ~qman_hwerr_txts[i].mask; + qm_ccsr_out(REG_ERR_IER, ier_val); + } + } + } + qm_ccsr_out(REG_ERR_ISR, isr_val); + + return IRQ_HANDLED; +} + +static int qman_init_ccsr(struct device *dev) +{ + int i, err; + + /* FQD memory */ + qm_set_memory(qm_memory_fqd, fqd_a, fqd_sz); + /* PFDR memory */ + qm_set_memory(qm_memory_pfdr, pfdr_a, pfdr_sz); + err = qm_init_pfdr(dev, 8, pfdr_sz / 64 - 8); + if (err) + return err; + /* thresholds */ + qm_set_pfdr_threshold(512, 64); + qm_set_sfdr_threshold(128); + /* clear stale PEBI bit from interrupt status register */ + qm_ccsr_out(REG_ERR_ISR, QM_EIRQ_PEBI); + /* corenet initiator settings */ + qm_set_corenet_initiator(); + /* HID settings */ + qm_set_hid(); + /* Set scheduling weights to defaults */ + for (i = qm_wq_first; i <= qm_wq_last; i++) + qm_set_wq_scheduling(i, 0, 0, 0, 0, 0, 0, 0); + /* We are not prepared to accept ERNs for hardware enqueues */ + qm_set_dc(qm_dc_portal_fman0, 1, 0); + qm_set_dc(qm_dc_portal_fman1, 1, 0); + return 0; +} + +#define LIO_CFG_LIODN_MASK 0x0fff0000 +void qman_liodn_fixup(u16 channel) +{ + static int done; + static u32 liodn_offset; + u32 before, after; + int idx = channel - QM_CHANNEL_SWPORTAL0; + + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) + before = qm_ccsr_in(REG_REV3_QCSP_LIO_CFG(idx)); + else + before = qm_ccsr_in(REG_QCSP_LIO_CFG(idx)); + if (!done) { + liodn_offset = before & LIO_CFG_LIODN_MASK; + done = 1; + return; + } + after = (before & (~LIO_CFG_LIODN_MASK)) | liodn_offset; + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) + qm_ccsr_out(REG_REV3_QCSP_LIO_CFG(idx), after); + else + qm_ccsr_out(REG_QCSP_LIO_CFG(idx), after); +} + +#define IO_CFG_SDEST_MASK 0x00ff0000 +void qman_set_sdest(u16 channel, unsigned int cpu_idx) +{ + int idx = channel - QM_CHANNEL_SWPORTAL0; + u32 before, after; + + if ((qman_ip_rev & 0xFF00) >= QMAN_REV30) { + before = qm_ccsr_in(REG_REV3_QCSP_IO_CFG(idx)); + /* Each pair of vcpu share the same SRQ(SDEST) */ + cpu_idx /= 2; + after = (before & (~IO_CFG_SDEST_MASK)) | (cpu_idx << 16); + qm_ccsr_out(REG_REV3_QCSP_IO_CFG(idx), after); + } else { + before = qm_ccsr_in(REG_QCSP_IO_CFG(idx)); + after = (before & (~IO_CFG_SDEST_MASK)) | (cpu_idx << 16); + qm_ccsr_out(REG_QCSP_IO_CFG(idx), after); + } +} + +static int 
qman_resource_init(struct device *dev) +{ + int pool_chan_num, cgrid_num; + int ret, i; + + switch (qman_ip_rev >> 8) { + case 1: + pool_chan_num = 15; + cgrid_num = 256; + break; + case 2: + pool_chan_num = 3; + cgrid_num = 64; + break; + case 3: + pool_chan_num = 15; + cgrid_num = 256; + break; + default: + return -ENODEV; + } + + ret = gen_pool_add(qm_qpalloc, qm_channel_pool1 | DPAA_GENALLOC_OFF, + pool_chan_num, -1); + if (ret) { + dev_err(dev, "Failed to seed pool channels (%d)\n", ret); + return ret; + } + + ret = gen_pool_add(qm_cgralloc, DPAA_GENALLOC_OFF, cgrid_num, -1); + if (ret) { + dev_err(dev, "Failed to seed CGRID range (%d)\n", ret); + return ret; + } + + /* parse pool channels into the SDQCR mask */ + for (i = 0; i < cgrid_num; i++) + qm_pools_sdqcr |= QM_SDQCR_CHANNELS_POOL_CONV(i); + + ret = gen_pool_add(qm_fqalloc, QM_FQID_RANGE_START | DPAA_GENALLOC_OFF, + qm_get_fqid_maxcnt() - QM_FQID_RANGE_START, -1); + if (ret) { + dev_err(dev, "Failed to seed FQID range (%d)\n", ret); + return ret; + } + + return 0; +} + +static int fsl_qman_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct resource *res; + int ret, err_irq; + u16 id; + u8 major, minor; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "Can't get %s property 'IORESOURCE_MEM'\n", + node->full_name); + return -ENXIO; + } + qm_ccsr_start = devm_ioremap(dev, res->start, resource_size(res)); + if (!qm_ccsr_start) + return -ENXIO; + + qm_get_version(&id, &major, &minor); + if (major == 1 && minor == 0) { + dev_err(dev, "Rev1.0 on P4080 rev1 is not supported!\n"); + return -ENODEV; + } else if (major == 1 && minor == 1) + qman_ip_rev = QMAN_REV11; + else if (major == 1 && minor == 2) + qman_ip_rev = QMAN_REV12; + else if (major == 2 && minor == 0) + qman_ip_rev = QMAN_REV20; + else if (major == 3 && minor == 0) + qman_ip_rev = QMAN_REV30; + else if (major == 3 && minor == 1) + qman_ip_rev = QMAN_REV31; + else { + dev_err(dev, "Unknown QMan version\n"); + return -ENODEV; + } + + if ((qman_ip_rev & 0xff00) >= QMAN_REV30) + qm_channel_pool1 = QMAN_CHANNEL_POOL1_REV3; + + ret = zero_priv_mem(dev, node, fqd_a, fqd_sz); + WARN_ON(ret); + if (ret) + return -ENODEV; + + ret = qman_init_ccsr(dev); + if (ret) { + dev_err(dev, "CCSR setup failed\n"); + return ret; + } + + err_irq = platform_get_irq(pdev, 0); + if (err_irq <= 0) { + dev_info(dev, "Can't get %s property 'interrupts'\n", + node->full_name); + return -ENODEV; + } + ret = devm_request_irq(dev, err_irq, qman_isr, IRQF_SHARED, "qman-err", + dev); + if (ret) { + dev_err(dev, "devm_request_irq() failed %d for '%s'\n", + ret, node->full_name); + return ret; + } + + /* + * Write-to-clear any stale bits, (eg. starvation being asserted prior + * to resource allocation during driver init). 
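+	 * All ISR bits are cleared and every error source is then enabled in
+	 * the IER; sources listed in QMAN_ERRS_TO_DISABLE may later be masked
+	 * again by qman_isr() once they have fired.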
+ */ + qm_ccsr_out(REG_ERR_ISR, 0xffffffff); + /* Enable Error Interrupts */ + qm_ccsr_out(REG_ERR_IER, 0xffffffff); + + qm_fqalloc = devm_gen_pool_create(dev, 0, -1, "qman-fqalloc"); + if (IS_ERR(qm_fqalloc)) { + ret = PTR_ERR(qm_fqalloc); + dev_err(dev, "qman-fqalloc pool init failed (%d)\n", ret); + return ret; + } + + qm_qpalloc = devm_gen_pool_create(dev, 0, -1, "qman-qpalloc"); + if (IS_ERR(qm_qpalloc)) { + ret = PTR_ERR(qm_qpalloc); + dev_err(dev, "qman-qpalloc pool init failed (%d)\n", ret); + return ret; + } + + qm_cgralloc = devm_gen_pool_create(dev, 0, -1, "qman-cgralloc"); + if (IS_ERR(qm_cgralloc)) { + ret = PTR_ERR(qm_cgralloc); + dev_err(dev, "qman-cgralloc pool init failed (%d)\n", ret); + return ret; + } + + ret = qman_resource_init(dev); + if (ret) + return ret; + + ret = qman_alloc_fq_table(qm_get_fqid_maxcnt()); + if (ret) + return ret; + + ret = qman_wq_alloc(); + if (ret) + return ret; + + return 0; +} + +static const struct of_device_id fsl_qman_ids[] = { + { + .compatible = "fsl,qman", + }, + {} +}; + +static struct platform_driver fsl_qman_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = fsl_qman_ids, + .suppress_bind_attrs = true, + }, + .probe = fsl_qman_probe, +}; + +builtin_platform_driver(fsl_qman_driver); diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c new file mode 100644 index 000000000000..148614388fca --- /dev/null +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -0,0 +1,355 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "qman_priv.h" + +/* Enable portal interupts (as opposed to polling mode) */ +#define CONFIG_FSL_DPA_PIRQ_SLOW 1 +#define CONFIG_FSL_DPA_PIRQ_FAST 1 + +static struct cpumask portal_cpus; +/* protect qman global registers and global data shared among portals */ +static DEFINE_SPINLOCK(qman_lock); + +static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu) +{ +#ifdef CONFIG_FSL_PAMU + struct device *dev = pcfg->dev; + int window_count = 1; + struct iommu_domain_geometry geom_attr; + struct pamu_stash_attribute stash_attr; + int ret; + + pcfg->iommu_domain = iommu_domain_alloc(&platform_bus_type); + if (!pcfg->iommu_domain) { + dev_err(dev, "%s(): iommu_domain_alloc() failed", __func__); + goto no_iommu; + } + geom_attr.aperture_start = 0; + geom_attr.aperture_end = + ((dma_addr_t)1 << min(8 * sizeof(dma_addr_t), (size_t)36)) - 1; + geom_attr.force_aperture = true; + ret = iommu_domain_set_attr(pcfg->iommu_domain, DOMAIN_ATTR_GEOMETRY, + &geom_attr); + if (ret < 0) { + dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__, + ret); + goto out_domain_free; + } + ret = iommu_domain_set_attr(pcfg->iommu_domain, DOMAIN_ATTR_WINDOWS, + &window_count); + if (ret < 0) { + dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__, + ret); + goto out_domain_free; + } + stash_attr.cpu = cpu; + stash_attr.cache = PAMU_ATTR_CACHE_L1; + ret = iommu_domain_set_attr(pcfg->iommu_domain, + DOMAIN_ATTR_FSL_PAMU_STASH, + &stash_attr); + if (ret < 0) { + dev_err(dev, "%s(): iommu_domain_set_attr() = %d", + __func__, ret); + goto out_domain_free; + } + ret = iommu_domain_window_enable(pcfg->iommu_domain, 0, 0, 1ULL << 36, + IOMMU_READ | IOMMU_WRITE); + if (ret < 0) { + dev_err(dev, "%s(): iommu_domain_window_enable() = %d", + __func__, ret); + goto out_domain_free; + } + ret = iommu_attach_device(pcfg->iommu_domain, dev); + if (ret < 0) { + dev_err(dev, "%s(): iommu_device_attach() = %d", __func__, + ret); + goto out_domain_free; + } + ret = iommu_domain_set_attr(pcfg->iommu_domain, + DOMAIN_ATTR_FSL_PAMU_ENABLE, + &window_count); + if (ret < 0) { + dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__, + ret); + goto out_detach_device; + } + +no_iommu: +#endif + qman_set_sdest(pcfg->channel, cpu); + + return; + +#ifdef CONFIG_FSL_PAMU +out_detach_device: + iommu_detach_device(pcfg->iommu_domain, NULL); +out_domain_free: + iommu_domain_free(pcfg->iommu_domain); + pcfg->iommu_domain = NULL; +#endif +} + +static struct qman_portal *init_pcfg(struct qm_portal_config *pcfg) +{ + struct qman_portal *p; + u32 irq_sources = 0; + + /* We need the same LIODN offset for all portals */ + qman_liodn_fixup(pcfg->channel); + + pcfg->iommu_domain = NULL; + portal_set_cpu(pcfg, pcfg->cpu); + + p = qman_create_affine_portal(pcfg, NULL); + if (!p) { + dev_crit(pcfg->dev, "%s: Portal failure on cpu %d\n", + __func__, pcfg->cpu); + return NULL; + } + + /* Determine what should be interrupt-vs-poll driven */ +#ifdef CONFIG_FSL_DPA_PIRQ_SLOW + irq_sources |= QM_PIRQ_EQCI | QM_PIRQ_EQRI | QM_PIRQ_MRI | + QM_PIRQ_CSCI; +#endif +#ifdef CONFIG_FSL_DPA_PIRQ_FAST + irq_sources |= QM_PIRQ_DQRI; +#endif + qman_p_irqsource_add(p, irq_sources); + + spin_lock(&qman_lock); + if (cpumask_equal(&portal_cpus, cpu_possible_mask)) { + /* all assigned portals are initialized now */ + qman_init_cgr_all(); + } + spin_unlock(&qman_lock); + + dev_info(pcfg->dev, "Portal initialised, cpu %d\n", pcfg->cpu); + + return p; +} + +static void qman_portal_update_sdest(const struct qm_portal_config *pcfg, + unsigned int cpu) 
+{ +#ifdef CONFIG_FSL_PAMU /* TODO */ + struct pamu_stash_attribute stash_attr; + int ret; + + if (pcfg->iommu_domain) { + stash_attr.cpu = cpu; + stash_attr.cache = PAMU_ATTR_CACHE_L1; + ret = iommu_domain_set_attr(pcfg->iommu_domain, + DOMAIN_ATTR_FSL_PAMU_STASH, &stash_attr); + if (ret < 0) { + dev_err(pcfg->dev, + "Failed to update pamu stash setting\n"); + return; + } + } +#endif + qman_set_sdest(pcfg->channel, cpu); +} + +static void qman_offline_cpu(unsigned int cpu) +{ + struct qman_portal *p; + const struct qm_portal_config *pcfg; + + p = affine_portals[cpu]; + if (p) { + pcfg = qman_get_qm_portal_config(p); + if (pcfg) { + irq_set_affinity(pcfg->irq, cpumask_of(0)); + qman_portal_update_sdest(pcfg, 0); + } + } +} + +static void qman_online_cpu(unsigned int cpu) +{ + struct qman_portal *p; + const struct qm_portal_config *pcfg; + + p = affine_portals[cpu]; + if (p) { + pcfg = qman_get_qm_portal_config(p); + if (pcfg) { + irq_set_affinity(pcfg->irq, cpumask_of(cpu)); + qman_portal_update_sdest(pcfg, cpu); + } + } +} + +static int qman_hotplug_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + qman_online_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + qman_offline_cpu(cpu); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block qman_hotplug_cpu_notifier = { + .notifier_call = qman_hotplug_cpu_callback, +}; + +static int qman_portal_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct qm_portal_config *pcfg; + struct resource *addr_phys[2]; + const u32 *channel; + void __iomem *va; + int irq, len, cpu; + + pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL); + if (!pcfg) + return -ENOMEM; + + pcfg->dev = dev; + + addr_phys[0] = platform_get_resource(pdev, IORESOURCE_MEM, + DPAA_PORTAL_CE); + if (!addr_phys[0]) { + dev_err(dev, "Can't get %s property 'reg::CE'\n", + node->full_name); + return -ENXIO; + } + + addr_phys[1] = platform_get_resource(pdev, IORESOURCE_MEM, + DPAA_PORTAL_CI); + if (!addr_phys[1]) { + dev_err(dev, "Can't get %s property 'reg::CI'\n", + node->full_name); + return -ENXIO; + } + + channel = of_get_property(node, "cell-index", &len); + if (!channel || (len != 4)) { + dev_err(dev, "Can't get %s property 'cell-index'\n", + node->full_name); + return -ENXIO; + } + pcfg->channel = *channel; + pcfg->cpu = -1; + irq = platform_get_irq(pdev, 0); + if (irq <= 0) { + dev_err(dev, "Can't get %s IRQ\n", node->full_name); + return -ENXIO; + } + pcfg->irq = irq; + + va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0); + if (!va) + goto err_ioremap1; + + pcfg->addr_virt[DPAA_PORTAL_CE] = va; + + va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]), + _PAGE_GUARDED | _PAGE_NO_CACHE); + if (!va) + goto err_ioremap2; + + pcfg->addr_virt[DPAA_PORTAL_CI] = va; + + pcfg->pools = qm_get_pools_sdqcr(); + + spin_lock(&qman_lock); + cpu = cpumask_next_zero(-1, &portal_cpus); + if (cpu >= nr_cpu_ids) { + /* unassigned portal, skip init */ + spin_unlock(&qman_lock); + return 0; + } + + cpumask_set_cpu(cpu, &portal_cpus); + spin_unlock(&qman_lock); + pcfg->cpu = cpu; + + if (!init_pcfg(pcfg)) + goto err_ioremap2; + + /* clear irq affinity if assigned cpu is offline */ + if (!cpu_online(cpu)) + qman_offline_cpu(cpu); + + return 0; + +err_ioremap2: + iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]); 
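+	/* fall through to the common error exit */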
+err_ioremap1: + dev_err(dev, "ioremap failed\n"); + return -ENXIO; +} + +static const struct of_device_id qman_portal_ids[] = { + { + .compatible = "fsl,qman-portal", + }, + {} +}; +MODULE_DEVICE_TABLE(of, qman_portal_ids); + +static struct platform_driver qman_portal_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = qman_portal_ids, + }, + .probe = qman_portal_probe, +}; + +static int __init qman_portal_driver_register(struct platform_driver *drv) +{ + int ret; + + ret = platform_driver_register(drv); + if (ret < 0) + return ret; + + register_hotcpu_notifier(&qman_hotplug_cpu_notifier); + + return 0; +} + +module_driver(qman_portal_driver, + qman_portal_driver_register, platform_driver_unregister); diff --git a/drivers/soc/fsl/qbman/qman_priv.h b/drivers/soc/fsl/qbman/qman_priv.h new file mode 100644 index 000000000000..5cf821e623a9 --- /dev/null +++ b/drivers/soc/fsl/qbman/qman_priv.h @@ -0,0 +1,371 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "dpaa_sys.h" + +#include +#include + +#if defined(CONFIG_FSL_PAMU) +#include +#endif + +struct qm_mcr_querywq { + u8 verb; + u8 result; + u16 channel_wq; /* ignores wq (3 lsbits): _res[0-2] */ + u8 __reserved[28]; + u32 wq_len[8]; +} __packed; + +static inline u16 qm_mcr_querywq_get_chan(const struct qm_mcr_querywq *wq) +{ + return wq->channel_wq >> 3; +} + +struct __qm_mcr_querycongestion { + u32 state[8]; +}; + +/* "Query Congestion Group State" */ +struct qm_mcr_querycongestion { + u8 verb; + u8 result; + u8 __reserved[30]; + /* Access this struct using qman_cgrs_get() */ + struct __qm_mcr_querycongestion state; +} __packed; + +/* "Query CGR" */ +struct qm_mcr_querycgr { + u8 verb; + u8 result; + u16 __reserved1; + struct __qm_mc_cgr cgr; /* CGR fields */ + u8 __reserved2[6]; + u8 i_bcnt_hi; /* high 8-bits of 40-bit "Instant" */ + u32 i_bcnt_lo; /* low 32-bits of 40-bit */ + u8 __reserved3[3]; + u8 a_bcnt_hi; /* high 8-bits of 40-bit "Average" */ + u32 a_bcnt_lo; /* low 32-bits of 40-bit */ + u32 cscn_targ_swp[4]; +} __packed; + +static inline u64 qm_mcr_querycgr_i_get64(const struct qm_mcr_querycgr *q) +{ + return ((u64)q->i_bcnt_hi << 32) | (u64)q->i_bcnt_lo; +} +static inline u64 qm_mcr_querycgr_a_get64(const struct qm_mcr_querycgr *q) +{ + return ((u64)q->a_bcnt_hi << 32) | (u64)q->a_bcnt_lo; +} + +/* "Query FQ Non-Programmable Fields" */ +struct qm_mcc_queryfq_np { + u8 _ncw_verb; + u8 __reserved1[3]; + u32 fqid; /* 24-bit */ + u8 __reserved2[56]; +} __packed; + +struct qm_mcr_queryfq_np { + u8 verb; + u8 result; + u8 __reserved1; + u8 state; /* QM_MCR_NP_STATE_*** */ + u32 fqd_link; /* 24-bit, _res2[24-31] */ + u16 odp_seq; /* 14-bit, _res3[14-15] */ + u16 orp_nesn; /* 14-bit, _res4[14-15] */ + u16 orp_ea_hseq; /* 15-bit, _res5[15] */ + u16 orp_ea_tseq; /* 15-bit, _res6[15] */ + u32 orp_ea_hptr; /* 24-bit, _res7[24-31] */ + u32 orp_ea_tptr; /* 24-bit, _res8[24-31] */ + u32 pfdr_hptr; /* 24-bit, _res9[24-31] */ + u32 pfdr_tptr; /* 24-bit, _res10[24-31] */ + u8 __reserved2[5]; + u8 is; /* 1-bit, _res12[1-7] */ + u16 ics_surp; + u32 byte_cnt; + u32 frm_cnt; /* 24-bit, _res13[24-31] */ + u32 __reserved3; + u16 ra1_sfdr; /* QM_MCR_NP_RA1_*** */ + u16 ra2_sfdr; /* QM_MCR_NP_RA2_*** */ + u16 __reserved4; + u16 od1_sfdr; /* QM_MCR_NP_OD1_*** */ + u16 od2_sfdr; /* QM_MCR_NP_OD2_*** */ + u16 od3_sfdr; /* QM_MCR_NP_OD3_*** */ +} __packed; + +#define QM_MCR_NP_STATE_FE 0x10 +#define QM_MCR_NP_STATE_R 0x08 +#define QM_MCR_NP_STATE_MASK 0x07 /* Reads FQD::STATE; */ +#define QM_MCR_NP_STATE_OOS 0x00 +#define QM_MCR_NP_STATE_RETIRED 0x01 +#define QM_MCR_NP_STATE_TEN_SCHED 0x02 +#define QM_MCR_NP_STATE_TRU_SCHED 0x03 +#define QM_MCR_NP_STATE_PARKED 0x04 +#define QM_MCR_NP_STATE_ACTIVE 0x05 +#define QM_MCR_NP_PTR_MASK 0x07ff /* for RA[12] & OD[123] */ +#define QM_MCR_NP_RA1_NRA(v) (((v) >> 14) & 0x3) /* FQD::NRA */ +#define QM_MCR_NP_RA2_IT(v) (((v) >> 14) & 0x1) /* FQD::IT */ +#define QM_MCR_NP_OD1_NOD(v) (((v) >> 14) & 0x3) /* FQD::NOD */ +#define QM_MCR_NP_OD3_NPC(v) (((v) >> 14) & 0x3) /* FQD::NPC */ + +enum qm_mcr_queryfq_np_masks { + qm_mcr_fqd_link_mask = BIT(24)-1, + qm_mcr_odp_seq_mask = BIT(14)-1, + qm_mcr_orp_nesn_mask = BIT(14)-1, + qm_mcr_orp_ea_hseq_mask = BIT(15)-1, + qm_mcr_orp_ea_tseq_mask = BIT(15)-1, + qm_mcr_orp_ea_hptr_mask = BIT(24)-1, + qm_mcr_orp_ea_tptr_mask = BIT(24)-1, + qm_mcr_pfdr_hptr_mask = BIT(24)-1, + qm_mcr_pfdr_tptr_mask = BIT(24)-1, + qm_mcr_is_mask = BIT(1)-1, + qm_mcr_frm_cnt_mask = BIT(24)-1, +}; 
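/*
 * (Editor's illustration, not part of the original patch.)  The masks in the
 * enum above exist so that users of the "Query FQ Non-Programmable Fields"
 * result can strip the reserved upper bits from each packed field; the
 * qm_mcr_np_get() helper defined just below does exactly that.  A minimal,
 * hypothetical use, assuming the management command has already filled in
 * the result structure:
 *
 *	struct qm_mcr_queryfq_np np;
 *	u32 frames, link;
 *
 *	// ... after the query command completes and 'np' holds the result ...
 *	frames = qm_mcr_np_get(&np, frm_cnt);	// 24-bit frame count
 *	link   = qm_mcr_np_get(&np, fqd_link);	// 24-bit FQD link, reserved byte cleared
 */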
+#define qm_mcr_np_get(np, field) \ + ((np)->field & (qm_mcr_##field##_mask)) + +/* Congestion Groups */ + +/* + * This wrapper represents a bit-array for the state of the 256 QMan congestion + * groups. Is also used as a *mask* for congestion groups, eg. so we ignore + * those that don't concern us. We harness the structure and accessor details + * already used in the management command to query congestion groups. + */ +#define CGR_BITS_PER_WORD 5 +#define CGR_WORD(x) ((x) >> CGR_BITS_PER_WORD) +#define CGR_BIT(x) (BIT(31) >> ((x) & 0x1f)) +#define CGR_NUM (sizeof(struct __qm_mcr_querycongestion) << 3) + +struct qman_cgrs { + struct __qm_mcr_querycongestion q; +}; + +static inline void qman_cgrs_init(struct qman_cgrs *c) +{ + memset(c, 0, sizeof(*c)); +} + +static inline void qman_cgrs_fill(struct qman_cgrs *c) +{ + memset(c, 0xff, sizeof(*c)); +} + +static inline int qman_cgrs_get(struct qman_cgrs *c, u8 cgr) +{ + return c->q.state[CGR_WORD(cgr)] & CGR_BIT(cgr); +} + +static inline void qman_cgrs_cp(struct qman_cgrs *dest, + const struct qman_cgrs *src) +{ + *dest = *src; +} + +static inline void qman_cgrs_and(struct qman_cgrs *dest, + const struct qman_cgrs *a, const struct qman_cgrs *b) +{ + int ret; + u32 *_d = dest->q.state; + const u32 *_a = a->q.state; + const u32 *_b = b->q.state; + + for (ret = 0; ret < 8; ret++) + *_d++ = *_a++ & *_b++; +} + +static inline void qman_cgrs_xor(struct qman_cgrs *dest, + const struct qman_cgrs *a, const struct qman_cgrs *b) +{ + int ret; + u32 *_d = dest->q.state; + const u32 *_a = a->q.state; + const u32 *_b = b->q.state; + + for (ret = 0; ret < 8; ret++) + *_d++ = *_a++ ^ *_b++; +} + +void qman_init_cgr_all(void); + +struct qm_portal_config { + /* + * Corenet portal addresses; + * [0]==cache-enabled, [1]==cache-inhibited. + */ + void __iomem *addr_virt[2]; + struct device *dev; + struct iommu_domain *iommu_domain; + /* Allow these to be joined in lists */ + struct list_head list; + /* User-visible portal configuration settings */ + /* portal is affined to this cpu */ + int cpu; + /* portal interrupt line */ + int irq; + /* + * the portal's dedicated channel id, used initialising + * frame queues to target this portal when scheduled + */ + u16 channel; + /* + * mask of pool channels this portal has dequeue access to + * (using QM_SDQCR_CHANNELS_POOL(n) for the bitmask) + */ + u32 pools; +}; + +/* Revision info (for errata and feature handling) */ +#define QMAN_REV11 0x0101 +#define QMAN_REV12 0x0102 +#define QMAN_REV20 0x0200 +#define QMAN_REV30 0x0300 +#define QMAN_REV31 0x0301 +extern u16 qman_ip_rev; /* 0 if uninitialised, otherwise QMAN_REVx */ + +#define QM_FQID_RANGE_START 1 /* FQID 0 reserved for internal use */ +extern struct gen_pool *qm_fqalloc; /* FQID allocator */ +extern struct gen_pool *qm_qpalloc; /* pool-channel allocator */ +extern struct gen_pool *qm_cgralloc; /* CGR ID allocator */ +u32 qm_get_pools_sdqcr(void); + +int qman_wq_alloc(void); +void qman_liodn_fixup(u16 channel); +void qman_set_sdest(u16 channel, unsigned int cpu_idx); + +struct qman_portal *qman_create_affine_portal( + const struct qm_portal_config *config, + const struct qman_cgrs *cgrs); +const struct qm_portal_config *qman_destroy_affine_portal(void); + +/* + * qman_query_fq - Queries FQD fields (via h/w query command) + * @fq: the frame queue object to be queried + * @fqd: storage for the queried FQD fields + */ +int qman_query_fq(struct qman_fq *fq, struct qm_fqd *fqd); + +/* + * For qman_volatile_dequeue(); Choose one PRECEDENCE. EXACT is optional. 
Use + * NUMFRAMES(n) (6-bit) or NUMFRAMES_TILLEMPTY to fill in the frame-count. Use + * FQID(n) to fill in the frame queue ID. + */ +#define QM_VDQCR_PRECEDENCE_VDQCR 0x0 +#define QM_VDQCR_PRECEDENCE_SDQCR 0x80000000 +#define QM_VDQCR_EXACT 0x40000000 +#define QM_VDQCR_NUMFRAMES_MASK 0x3f000000 +#define QM_VDQCR_NUMFRAMES_SET(n) (((n) & 0x3f) << 24) +#define QM_VDQCR_NUMFRAMES_GET(n) (((n) >> 24) & 0x3f) +#define QM_VDQCR_NUMFRAMES_TILLEMPTY QM_VDQCR_NUMFRAMES_SET(0) + +#define QMAN_VOLATILE_FLAG_WAIT 0x00000001 /* wait if VDQCR is in use */ +#define QMAN_VOLATILE_FLAG_WAIT_INT 0x00000002 /* if wait, interruptible? */ +#define QMAN_VOLATILE_FLAG_FINISH 0x00000004 /* wait till VDQCR completes */ + +/* + * qman_volatile_dequeue - Issue a volatile dequeue command + * @fq: the frame queue object to dequeue from + * @flags: a bit-mask of QMAN_VOLATILE_FLAG_*** options + * @vdqcr: bit mask of QM_VDQCR_*** options, as per qm_dqrr_vdqcr_set() + * + * Attempts to lock access to the portal's VDQCR volatile dequeue functionality. + * The function will block and sleep if QMAN_VOLATILE_FLAG_WAIT is specified and + * the VDQCR is already in use, otherwise returns non-zero for failure. If + * QMAN_VOLATILE_FLAG_FINISH is specified, the function will only return once + * the VDQCR command has finished executing (ie. once the callback for the last + * DQRR entry resulting from the VDQCR command has been called). If not using + * the FINISH flag, completion can be determined either by detecting the + * presence of the QM_DQRR_STAT_UNSCHEDULED and QM_DQRR_STAT_DQCR_EXPIRED bits + * in the "stat" parameter passed to the FQ's dequeue callback, or by waiting + * for the QMAN_FQ_STATE_VDQCR bit to disappear. + */ +int qman_volatile_dequeue(struct qman_fq *fq, u32 flags, u32 vdqcr); + +int qman_alloc_fq_table(u32 num_fqids); + +/* QMan s/w corenet portal, low-level i/face */ + +/* + * For qm_dqrr_sdqcr_set(); Choose one SOURCE. Choose one COUNT. Choose one + * dequeue TYPE. Choose TOKEN (8-bit). + * If SOURCE == CHANNELS, + * Choose CHANNELS_DEDICATED and/or CHANNELS_POOL(n). + * You can choose DEDICATED_PRECEDENCE if the portal channel should have + * priority. + * If SOURCE == SPECIFICWQ, + * Either select the work-queue ID with SPECIFICWQ_WQ(), or select the + * channel (SPECIFICWQ_DEDICATED or SPECIFICWQ_POOL()) and specify the + * work-queue priority (0-7) with SPECIFICWQ_WQ() - either way, you get the + * same value. 
+ */ +#define QM_SDQCR_SOURCE_CHANNELS 0x0 +#define QM_SDQCR_SOURCE_SPECIFICWQ 0x40000000 +#define QM_SDQCR_COUNT_EXACT1 0x0 +#define QM_SDQCR_COUNT_UPTO3 0x20000000 +#define QM_SDQCR_DEDICATED_PRECEDENCE 0x10000000 +#define QM_SDQCR_TYPE_MASK 0x03000000 +#define QM_SDQCR_TYPE_NULL 0x0 +#define QM_SDQCR_TYPE_PRIO_QOS 0x01000000 +#define QM_SDQCR_TYPE_ACTIVE_QOS 0x02000000 +#define QM_SDQCR_TYPE_ACTIVE 0x03000000 +#define QM_SDQCR_TOKEN_MASK 0x00ff0000 +#define QM_SDQCR_TOKEN_SET(v) (((v) & 0xff) << 16) +#define QM_SDQCR_TOKEN_GET(v) (((v) >> 16) & 0xff) +#define QM_SDQCR_CHANNELS_DEDICATED 0x00008000 +#define QM_SDQCR_SPECIFICWQ_MASK 0x000000f7 +#define QM_SDQCR_SPECIFICWQ_DEDICATED 0x00000000 +#define QM_SDQCR_SPECIFICWQ_POOL(n) ((n) << 4) +#define QM_SDQCR_SPECIFICWQ_WQ(n) (n) + +/* For qm_dqrr_vdqcr_set(): use FQID(n) to fill in the frame queue ID */ +#define QM_VDQCR_FQID_MASK 0x00ffffff +#define QM_VDQCR_FQID(n) ((n) & QM_VDQCR_FQID_MASK) + +/* + * Used by all portal interrupt registers except 'inhibit' + * Channels with frame availability + */ +#define QM_PIRQ_DQAVAIL 0x0000ffff + +/* The DQAVAIL interrupt fields break down into these bits; */ +#define QM_DQAVAIL_PORTAL 0x8000 /* Portal channel */ +#define QM_DQAVAIL_POOL(n) (0x8000 >> (n)) /* Pool channel, n==[1..15] */ +#define QM_DQAVAIL_MASK 0xffff +/* This mask contains all the "irqsource" bits visible to API users */ +#define QM_PIRQ_VISIBLE (QM_PIRQ_SLOW | QM_PIRQ_DQRI) + +extern struct qman_portal *affine_portals[NR_CPUS]; +const struct qm_portal_config *qman_get_qm_portal_config( + struct qman_portal *portal); diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h new file mode 100644 index 000000000000..37f3eb001a16 --- /dev/null +++ b/include/soc/fsl/qman.h @@ -0,0 +1,1074 @@ +/* Copyright 2008 - 2016 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __FSL_QMAN_H +#define __FSL_QMAN_H + +#include + +/* Hardware constants */ +#define QM_CHANNEL_SWPORTAL0 0 +#define QMAN_CHANNEL_POOL1 0x21 +#define QMAN_CHANNEL_POOL1_REV3 0x401 +extern u16 qm_channel_pool1; + +/* Portal processing (interrupt) sources */ +#define QM_PIRQ_CSCI 0x00100000 /* Congestion State Change */ +#define QM_PIRQ_EQCI 0x00080000 /* Enqueue Command Committed */ +#define QM_PIRQ_EQRI 0x00040000 /* EQCR Ring (below threshold) */ +#define QM_PIRQ_DQRI 0x00020000 /* DQRR Ring (non-empty) */ +#define QM_PIRQ_MRI 0x00010000 /* MR Ring (non-empty) */ +/* + * This mask contains all the interrupt sources that need handling except DQRI, + * ie. that if present should trigger slow-path processing. + */ +#define QM_PIRQ_SLOW (QM_PIRQ_CSCI | QM_PIRQ_EQCI | QM_PIRQ_EQRI | \ + QM_PIRQ_MRI) + +/* For qman_static_dequeue_*** APIs */ +#define QM_SDQCR_CHANNELS_POOL_MASK 0x00007fff +/* for n in [1,15] */ +#define QM_SDQCR_CHANNELS_POOL(n) (0x00008000 >> (n)) +/* for conversion from n of qm_channel */ +static inline u32 QM_SDQCR_CHANNELS_POOL_CONV(u16 channel) +{ + return QM_SDQCR_CHANNELS_POOL(channel + 1 - qm_channel_pool1); +} + +/* --- QMan data structures (and associated constants) --- */ + +/* "Frame Descriptor (FD)" */ +struct qm_fd { + union { + struct { + u8 cfg8b_w1; + u8 bpid; /* Buffer Pool ID */ + u8 cfg8b_w3; + u8 addr_hi; /* high 8-bits of 40-bit address */ + __be32 addr_lo; /* low 32-bits of 40-bit address */ + } __packed; + __be64 data; + }; + __be32 cfg; /* format, offset, length / congestion */ + union { + __be32 cmd; + __be32 status; + }; +} __aligned(8); + +#define QM_FD_FORMAT_SG BIT(31) +#define QM_FD_FORMAT_LONG BIT(30) +#define QM_FD_FORMAT_COMPOUND BIT(29) +#define QM_FD_FORMAT_MASK GENMASK(31, 29) +#define QM_FD_OFF_SHIFT 20 +#define QM_FD_OFF_MASK GENMASK(28, 20) +#define QM_FD_LEN_MASK GENMASK(19, 0) +#define QM_FD_LEN_BIG_MASK GENMASK(28, 0) + +enum qm_fd_format { + /* + * 'contig' implies a contiguous buffer, whereas 'sg' implies a + * scatter-gather table. 'big' implies a 29-bit length with no offset + * field, otherwise length is 20-bit and offset is 9-bit. 'compound' + * implies a s/g-like table, where each entry itself represents a frame + * (contiguous or scatter-gather) and the 29-bit "length" is + * interpreted purely for congestion calculations, ie. a "congestion + * weight". 
+ */ + qm_fd_contig = 0, + qm_fd_contig_big = QM_FD_FORMAT_LONG, + qm_fd_sg = QM_FD_FORMAT_SG, + qm_fd_sg_big = QM_FD_FORMAT_SG | QM_FD_FORMAT_LONG, + qm_fd_compound = QM_FD_FORMAT_COMPOUND +}; + +static inline dma_addr_t qm_fd_addr(const struct qm_fd *fd) +{ + return be64_to_cpu(fd->data) & 0xffffffffffLLU; +} + +static inline u64 qm_fd_addr_get64(const struct qm_fd *fd) +{ + return be64_to_cpu(fd->data) & 0xffffffffffLLU; +} + +static inline void qm_fd_addr_set64(struct qm_fd *fd, u64 addr) +{ + fd->addr_hi = upper_32_bits(addr); + fd->addr_lo = cpu_to_be32(lower_32_bits(addr)); +} + +/* + * The 'format' field indicates the interpretation of the remaining + * 29 bits of the 32-bit word. + * If 'format' is _contig or _sg, 20b length and 9b offset. + * If 'format' is _contig_big or _sg_big, 29b length. + * If 'format' is _compound, 29b "congestion weight". + */ +static inline enum qm_fd_format qm_fd_get_format(const struct qm_fd *fd) +{ + return be32_to_cpu(fd->cfg) & QM_FD_FORMAT_MASK; +} + +static inline int qm_fd_get_offset(const struct qm_fd *fd) +{ + return (be32_to_cpu(fd->cfg) & QM_FD_OFF_MASK) >> QM_FD_OFF_SHIFT; +} + +static inline int qm_fd_get_length(const struct qm_fd *fd) +{ + return be32_to_cpu(fd->cfg) & QM_FD_LEN_MASK; +} + +static inline int qm_fd_get_len_big(const struct qm_fd *fd) +{ + return be32_to_cpu(fd->cfg) & QM_FD_LEN_BIG_MASK; +} + +static inline void qm_fd_set_param(struct qm_fd *fd, enum qm_fd_format fmt, + int off, int len) +{ + fd->cfg = cpu_to_be32(fmt | (len & QM_FD_LEN_BIG_MASK) | + ((off << QM_FD_OFF_SHIFT) & QM_FD_OFF_MASK)); +} + +#define qm_fd_set_contig(fd, off, len) \ + qm_fd_set_param(fd, qm_fd_contig, off, len) +#define qm_fd_set_sg(fd, off, len) qm_fd_set_param(fd, qm_fd_sg, off, len) +#define qm_fd_set_contig_big(fd, len) \ + qm_fd_set_param(fd, qm_fd_contig_big, 0, len) +#define qm_fd_set_sg_big(fd, len) qm_fd_set_param(fd, qm_fd_sg_big, 0, len) + +static inline void qm_fd_clear_fd(struct qm_fd *fd) +{ + fd->data = 0; + fd->cfg = 0; + fd->cmd = 0; +} + +/* Scatter/Gather table entry */ +struct qm_sg_entry { + union { + struct { + u8 __reserved1[3]; + u8 addr_hi; /* high 8-bits of 40-bit address */ + __be32 addr_lo; /* low 32-bits of 40-bit address */ + }; + __be64 data; + }; + __be32 cfg; /* E bit, F bit, length */ + u8 __reserved2; + u8 bpid; + __be16 offset; /* 13-bit, _res[13-15]*/ +} __packed; + +#define QM_SG_LEN_MASK GENMASK(29, 0) +#define QM_SG_OFF_MASK GENMASK(12, 0) +#define QM_SG_FIN BIT(30) +#define QM_SG_EXT BIT(31) + +static inline dma_addr_t qm_sg_addr(const struct qm_sg_entry *sg) +{ + return be64_to_cpu(sg->data) & 0xffffffffffLLU; +} + +static inline u64 qm_sg_entry_get64(const struct qm_sg_entry *sg) +{ + return be64_to_cpu(sg->data) & 0xffffffffffLLU; +} + +static inline void qm_sg_entry_set64(struct qm_sg_entry *sg, u64 addr) +{ + sg->addr_hi = upper_32_bits(addr); + sg->addr_lo = cpu_to_be32(lower_32_bits(addr)); +} + +static inline bool qm_sg_entry_is_final(const struct qm_sg_entry *sg) +{ + return be32_to_cpu(sg->cfg) & QM_SG_FIN; +} + +static inline bool qm_sg_entry_is_ext(const struct qm_sg_entry *sg) +{ + return be32_to_cpu(sg->cfg) & QM_SG_EXT; +} + +static inline int qm_sg_entry_get_len(const struct qm_sg_entry *sg) +{ + return be32_to_cpu(sg->cfg) & QM_SG_LEN_MASK; +} + +static inline void qm_sg_entry_set_len(struct qm_sg_entry *sg, int len) +{ + sg->cfg = cpu_to_be32(len & QM_SG_LEN_MASK); +} + +static inline void qm_sg_entry_set_f(struct qm_sg_entry *sg, int len) +{ + sg->cfg = cpu_to_be32(QM_SG_FIN | (len & 
QM_SG_LEN_MASK)); +} + +static inline int qm_sg_entry_get_off(const struct qm_sg_entry *sg) +{ + return be32_to_cpu(sg->offset) & QM_SG_OFF_MASK; +} + +/* "Frame Dequeue Response" */ +struct qm_dqrr_entry { + u8 verb; + u8 stat; + u16 seqnum; /* 15-bit */ + u8 tok; + u8 __reserved2[3]; + u32 fqid; /* 24-bit */ + u32 contextB; + struct qm_fd fd; + u8 __reserved4[32]; +} __packed; +#define QM_DQRR_VERB_VBIT 0x80 +#define QM_DQRR_VERB_MASK 0x7f /* where the verb contains; */ +#define QM_DQRR_VERB_FRAME_DEQUEUE 0x60 /* "this format" */ +#define QM_DQRR_STAT_FQ_EMPTY 0x80 /* FQ empty */ +#define QM_DQRR_STAT_FQ_HELDACTIVE 0x40 /* FQ held active */ +#define QM_DQRR_STAT_FQ_FORCEELIGIBLE 0x20 /* FQ was force-eligible'd */ +#define QM_DQRR_STAT_FD_VALID 0x10 /* has a non-NULL FD */ +#define QM_DQRR_STAT_UNSCHEDULED 0x02 /* Unscheduled dequeue */ +#define QM_DQRR_STAT_DQCR_EXPIRED 0x01 /* VDQCR or PDQCR expired*/ + +/* "ERN Message Response" */ +/* "FQ State Change Notification" */ +union qm_mr_entry { + struct { + u8 verb; + u8 __reserved[63]; + }; + struct { + u8 verb; + u8 dca; + u16 seqnum; + u8 rc; /* Rej Code: 8-bit */ + u8 orp_hi; /* ORP: 24-bit */ + u16 orp_lo; + u32 fqid; /* 24-bit */ + u32 tag; + struct qm_fd fd; + u8 __reserved1[32]; + } __packed ern; + struct { + u8 verb; + u8 fqs; /* Frame Queue Status */ + u8 __reserved1[6]; + u32 fqid; /* 24-bit */ + u32 contextB; + u8 __reserved2[48]; + } __packed fq; /* FQRN/FQRNI/FQRL/FQPN */ +}; +#define QM_MR_VERB_VBIT 0x80 +/* + * ERNs originating from direct-connect portals ("dcern") use 0x20 as a verb + * which would be invalid as a s/w enqueue verb. A s/w ERN can be distinguished + * from the other MR types by noting if the 0x20 bit is unset. + */ +#define QM_MR_VERB_TYPE_MASK 0x27 +#define QM_MR_VERB_DC_ERN 0x20 +#define QM_MR_VERB_FQRN 0x21 +#define QM_MR_VERB_FQRNI 0x22 +#define QM_MR_VERB_FQRL 0x23 +#define QM_MR_VERB_FQPN 0x24 +#define QM_MR_RC_MASK 0xf0 /* contains one of; */ +#define QM_MR_RC_CGR_TAILDROP 0x00 +#define QM_MR_RC_WRED 0x10 +#define QM_MR_RC_ERROR 0x20 +#define QM_MR_RC_ORPWINDOW_EARLY 0x30 +#define QM_MR_RC_ORPWINDOW_LATE 0x40 +#define QM_MR_RC_FQ_TAILDROP 0x50 +#define QM_MR_RC_ORPWINDOW_RETIRED 0x60 +#define QM_MR_RC_ORP_ZERO 0x70 +#define QM_MR_FQS_ORLPRESENT 0x02 /* ORL fragments to come */ +#define QM_MR_FQS_NOTEMPTY 0x01 /* FQ has enqueued frames */ + +/* + * An identical structure of FQD fields is present in the "Init FQ" command and + * the "Query FQ" result, it's suctioned out into the "struct qm_fqd" type. + * Within that, the 'stashing' and 'taildrop' pieces are also factored out, the + * latter has two inlines to assist with converting to/from the mant+exp + * representation. + */ +struct qm_fqd_stashing { + /* See QM_STASHING_EXCL_<...> */ + u8 exclusive; + /* Numbers of cachelines */ + u8 cl; /* _res[6-7], as[4-5], ds[2-3], cs[0-1] */ +}; + +struct qm_fqd_oac { + /* "Overhead Accounting Control", see QM_OAC_<...> */ + u8 oac; /* oac[6-7], _res[0-5] */ + /* Two's-complement value (-128 to +127) */ + s8 oal; /* "Overhead Accounting Length" */ +}; + +struct qm_fqd { + /* _res[6-7], orprws[3-5], oa[2], olws[0-1] */ + u8 orpc; + u8 cgid; + __be16 fq_ctrl; /* See QM_FQCTRL_<...> */ + __be16 dest_wq; /* channel[3-15], wq[0-2] */ + __be16 ics_cred; /* 15-bit */ + /* + * For "Initialize Frame Queue" commands, the write-enable mask + * determines whether 'td' or 'oac_init' is observed. For query + * commands, this field is always 'td', and 'oac_query' (below) reflects + * the Overhead ACcounting values. 
+ */ + union { + __be16 td; /* "Taildrop": _res[13-15], mant[5-12], exp[0-4] */ + struct qm_fqd_oac oac_init; + }; + __be32 context_b; + union { + /* Treat it as 64-bit opaque */ + __be64 opaque; + struct { + __be32 hi; + __be32 lo; + }; + /* Treat it as s/w portal stashing config */ + /* see "FQD Context_A field used for [...]" */ + struct { + struct qm_fqd_stashing stashing; + /* + * 48-bit address of FQ context to + * stash, must be cacheline-aligned + */ + __be16 context_hi; + __be32 context_lo; + } __packed; + } context_a; + struct qm_fqd_oac oac_query; +} __packed; + +#define QM_FQD_CHAN_OFF 3 +#define QM_FQD_WQ_MASK GENMASK(2, 0) +#define QM_FQD_TD_EXP_MASK GENMASK(4, 0) +#define QM_FQD_TD_MANT_OFF 5 +#define QM_FQD_TD_MANT_MASK GENMASK(12, 5) +#define QM_FQD_TD_MAX 0xe0000000 +#define QM_FQD_TD_MANT_MAX 0xff +#define QM_FQD_OAC_OFF 6 +#define QM_FQD_AS_OFF 4 +#define QM_FQD_DS_OFF 2 +#define QM_FQD_XS_MASK 0x3 + +/* 64-bit converters for context_hi/lo */ +static inline u64 qm_fqd_stashing_get64(const struct qm_fqd *fqd) +{ + return be64_to_cpu(fqd->context_a.opaque) & 0xffffffffffffULL; +} + +static inline dma_addr_t qm_fqd_stashing_addr(const struct qm_fqd *fqd) +{ + return be64_to_cpu(fqd->context_a.opaque) & 0xffffffffffffULL; +} + +static inline u64 qm_fqd_context_a_get64(const struct qm_fqd *fqd) +{ + return qm_fqd_stashing_get64(fqd); +} + +static inline void qm_fqd_stashing_set64(struct qm_fqd *fqd, u64 addr) +{ + fqd->context_a.context_hi = upper_32_bits(addr); + fqd->context_a.context_lo = lower_32_bits(addr); +} + +static inline void qm_fqd_context_a_set64(struct qm_fqd *fqd, u64 addr) +{ + fqd->context_a.hi = cpu_to_be16(upper_32_bits(addr)); + fqd->context_a.lo = cpu_to_be32(lower_32_bits(addr)); +} + +/* convert a threshold value into mant+exp representation */ +static inline int qm_fqd_set_taildrop(struct qm_fqd *fqd, u32 val, + int roundup) +{ + u32 e = 0; + int td, oddbit = 0; + + if (val > QM_FQD_TD_MAX) + return -ERANGE; + + while (val > QM_FQD_TD_MANT_MAX) { + oddbit = val & 1; + val >>= 1; + e++; + if (roundup && oddbit) + val++; + } + + td = (val << QM_FQD_TD_MANT_OFF) & QM_FQD_TD_MANT_MASK; + td |= (e & QM_FQD_TD_EXP_MASK); + fqd->td = cpu_to_be16(td); + return 0; +} +/* and the other direction */ +static inline int qm_fqd_get_taildrop(const struct qm_fqd *fqd) +{ + int td = be16_to_cpu(fqd->td); + + return ((td & QM_FQD_TD_MANT_MASK) >> QM_FQD_TD_MANT_OFF) + << (td & QM_FQD_TD_EXP_MASK); +} + +static inline void qm_fqd_set_stashing(struct qm_fqd *fqd, u8 as, u8 ds, u8 cs) +{ + struct qm_fqd_stashing *st = &fqd->context_a.stashing; + + st->cl = ((as & QM_FQD_XS_MASK) << QM_FQD_AS_OFF) | + ((ds & QM_FQD_XS_MASK) << QM_FQD_DS_OFF) | + (cs & QM_FQD_XS_MASK); +} + +static inline u8 qm_fqd_get_stashing(const struct qm_fqd *fqd) +{ + return fqd->context_a.stashing.cl; +} + +static inline void qm_fqd_set_oac(struct qm_fqd *fqd, u8 val) +{ + fqd->oac_init.oac = val << QM_FQD_OAC_OFF; +} + +static inline void qm_fqd_set_oal(struct qm_fqd *fqd, s8 val) +{ + fqd->oac_init.oal = val; +} + +static inline void qm_fqd_set_destwq(struct qm_fqd *fqd, int ch, int wq) +{ + fqd->dest_wq = cpu_to_be16((ch << QM_FQD_CHAN_OFF) | + (wq & QM_FQD_WQ_MASK)); +} + +static inline int qm_fqd_get_chan(const struct qm_fqd *fqd) +{ + return be16_to_cpu(fqd->dest_wq) >> QM_FQD_CHAN_OFF; +} + +static inline int qm_fqd_get_wq(const struct qm_fqd *fqd) +{ + return be16_to_cpu(fqd->dest_wq) & QM_FQD_WQ_MASK; +} + +/* See "Frame Queue Descriptor (FQD)" */ +/* Frame Queue Descriptor (FQD) field 
'fq_ctrl' uses these constants */ +#define QM_FQCTRL_MASK 0x07ff /* 'fq_ctrl' flags; */ +#define QM_FQCTRL_CGE 0x0400 /* Congestion Group Enable */ +#define QM_FQCTRL_TDE 0x0200 /* Tail-Drop Enable */ +#define QM_FQCTRL_CTXASTASHING 0x0080 /* Context-A stashing */ +#define QM_FQCTRL_CPCSTASH 0x0040 /* CPC Stash Enable */ +#define QM_FQCTRL_FORCESFDR 0x0008 /* High-priority SFDRs */ +#define QM_FQCTRL_AVOIDBLOCK 0x0004 /* Don't block active */ +#define QM_FQCTRL_HOLDACTIVE 0x0002 /* Hold active in portal */ +#define QM_FQCTRL_PREFERINCACHE 0x0001 /* Aggressively cache FQD */ +#define QM_FQCTRL_LOCKINCACHE QM_FQCTRL_PREFERINCACHE /* older naming */ + +/* See "FQD Context_A field used for [...] */ +/* Frame Queue Descriptor (FQD) field 'CONTEXT_A' uses these constants */ +#define QM_STASHING_EXCL_ANNOTATION 0x04 +#define QM_STASHING_EXCL_DATA 0x02 +#define QM_STASHING_EXCL_CTX 0x01 + +/* See "Intra Class Scheduling" */ +/* FQD field 'OAC' (Overhead ACcounting) uses these constants */ +#define QM_OAC_ICS 0x2 /* Accounting for Intra-Class Scheduling */ +#define QM_OAC_CG 0x1 /* Accounting for Congestion Groups */ + +/* + * This struct represents the 32-bit "WR_PARM_[GYR]" parameters in CGR fields + * and associated commands/responses. The WRED parameters are calculated from + * these fields as follows; + * MaxTH = MA * (2 ^ Mn) + * Slope = SA / (2 ^ Sn) + * MaxP = 4 * (Pn + 1) + */ +struct qm_cgr_wr_parm { + /* MA[24-31], Mn[19-23], SA[12-18], Sn[6-11], Pn[0-5] */ + u32 word; +}; +/* + * This struct represents the 13-bit "CS_THRES" CGR field. In the corresponding + * management commands, this is padded to a 16-bit structure field, so that's + * how we represent it here. The congestion state threshold is calculated from + * these fields as follows; + * CS threshold = TA * (2 ^ Tn) + */ +struct qm_cgr_cs_thres { + /* _res[13-15], TA[5-12], Tn[0-4] */ + u16 word; +}; +/* + * This identical structure of CGR fields is present in the "Init/Modify CGR" + * commands and the "Query CGR" result. It's suctioned out here into its own + * struct. 
+ */ +struct __qm_mc_cgr { + struct qm_cgr_wr_parm wr_parm_g; + struct qm_cgr_wr_parm wr_parm_y; + struct qm_cgr_wr_parm wr_parm_r; + u8 wr_en_g; /* boolean, use QM_CGR_EN */ + u8 wr_en_y; /* boolean, use QM_CGR_EN */ + u8 wr_en_r; /* boolean, use QM_CGR_EN */ + u8 cscn_en; /* boolean, use QM_CGR_EN */ + union { + struct { + u16 cscn_targ_upd_ctrl; /* use QM_CSCN_TARG_UDP_ */ + u16 cscn_targ_dcp_low; /* CSCN_TARG_DCP low-16bits */ + }; + u32 cscn_targ; /* use QM_CGR_TARG_* */ + }; + u8 cstd_en; /* boolean, use QM_CGR_EN */ + u8 cs; /* boolean, only used in query response */ + struct qm_cgr_cs_thres cs_thres; /* use qm_cgr_cs_thres_set64() */ + u8 mode; /* QMAN_CGR_MODE_FRAME not supported in rev1.0 */ +} __packed; +#define QM_CGR_EN 0x01 /* For wr_en_*, cscn_en, cstd_en */ +#define QM_CGR_TARG_UDP_CTRL_WRITE_BIT 0x8000 /* value written to portal bit*/ +#define QM_CGR_TARG_UDP_CTRL_DCP 0x4000 /* 0: SWP, 1: DCP */ +#define QM_CGR_TARG_PORTAL(n) (0x80000000 >> (n)) /* s/w portal, 0-9 */ +#define QM_CGR_TARG_FMAN0 0x00200000 /* direct-connect portal: fman0 */ +#define QM_CGR_TARG_FMAN1 0x00100000 /* : fman1 */ +/* Convert CGR thresholds to/from "cs_thres" format */ +static inline u64 qm_cgr_cs_thres_get64(const struct qm_cgr_cs_thres *th) +{ + return ((th->word >> 5) & 0xff) << (th->word & 0x1f); +} + +static inline int qm_cgr_cs_thres_set64(struct qm_cgr_cs_thres *th, u64 val, + int roundup) +{ + u32 e = 0; + int oddbit = 0; + + while (val > 0xff) { + oddbit = val & 1; + val >>= 1; + e++; + if (roundup && oddbit) + val++; + } + th->word = ((val & 0xff) << 5) | (e & 0x1f); + return 0; +} + +/* "Initialize FQ" */ +struct qm_mcc_initfq { + u8 __reserved1[2]; + u16 we_mask; /* Write Enable Mask */ + u32 fqid; /* 24-bit */ + u16 count; /* Initialises 'count+1' FQDs */ + struct qm_fqd fqd; /* the FQD fields go here */ + u8 __reserved2[30]; +} __packed; +/* "Initialize/Modify CGR" */ +struct qm_mcc_initcgr { + u8 __reserve1[2]; + u16 we_mask; /* Write Enable Mask */ + struct __qm_mc_cgr cgr; /* CGR fields */ + u8 __reserved2[2]; + u8 cgid; + u8 __reserved3[32]; +} __packed; + +/* INITFQ-specific flags */ +#define QM_INITFQ_WE_MASK 0x01ff /* 'Write Enable' flags; */ +#define QM_INITFQ_WE_OAC 0x0100 +#define QM_INITFQ_WE_ORPC 0x0080 +#define QM_INITFQ_WE_CGID 0x0040 +#define QM_INITFQ_WE_FQCTRL 0x0020 +#define QM_INITFQ_WE_DESTWQ 0x0010 +#define QM_INITFQ_WE_ICSCRED 0x0008 +#define QM_INITFQ_WE_TDTHRESH 0x0004 +#define QM_INITFQ_WE_CONTEXTB 0x0002 +#define QM_INITFQ_WE_CONTEXTA 0x0001 +/* INITCGR/MODIFYCGR-specific flags */ +#define QM_CGR_WE_MASK 0x07ff /* 'Write Enable Mask'; */ +#define QM_CGR_WE_WR_PARM_G 0x0400 +#define QM_CGR_WE_WR_PARM_Y 0x0200 +#define QM_CGR_WE_WR_PARM_R 0x0100 +#define QM_CGR_WE_WR_EN_G 0x0080 +#define QM_CGR_WE_WR_EN_Y 0x0040 +#define QM_CGR_WE_WR_EN_R 0x0020 +#define QM_CGR_WE_CSCN_EN 0x0010 +#define QM_CGR_WE_CSCN_TARG 0x0008 +#define QM_CGR_WE_CSTD_EN 0x0004 +#define QM_CGR_WE_CS_THRES 0x0002 +#define QM_CGR_WE_MODE 0x0001 + +#define QMAN_CGR_FLAG_USE_INIT 0x00000001 + + /* Portal and Frame Queues */ +/* Represents a managed portal */ +struct qman_portal; + +/* + * This object type represents QMan frame queue descriptors (FQD), it is + * cacheline-aligned, and initialised by qman_create_fq(). The structure is + * defined further down. + */ +struct qman_fq; + +/* + * This object type represents a QMan congestion group, it is defined further + * down. 
+ */ +struct qman_cgr; + +/* + * This enum, and the callback type that returns it, are used when handling + * dequeued frames via DQRR. Note that for "null" callbacks registered with the + * portal object (for handling dequeues that do not demux because contextB is + * NULL), the return value *MUST* be qman_cb_dqrr_consume. + */ +enum qman_cb_dqrr_result { + /* DQRR entry can be consumed */ + qman_cb_dqrr_consume, + /* Like _consume, but requests parking - FQ must be held-active */ + qman_cb_dqrr_park, + /* Does not consume, for DCA mode only. */ + qman_cb_dqrr_defer, + /* + * Stop processing without consuming this ring entry. Exits the current + * qman_p_poll_dqrr() or interrupt-handling, as appropriate. If within + * an interrupt handler, the callback would typically call + * qman_irqsource_remove(QM_PIRQ_DQRI) before returning this value, + * otherwise the interrupt will reassert immediately. + */ + qman_cb_dqrr_stop, + /* Like qman_cb_dqrr_stop, but consumes the current entry. */ + qman_cb_dqrr_consume_stop +}; +typedef enum qman_cb_dqrr_result (*qman_cb_dqrr)(struct qman_portal *qm, + struct qman_fq *fq, + const struct qm_dqrr_entry *dqrr); + +/* + * This callback type is used when handling ERNs, FQRNs and FQRLs via MR. They + * are always consumed after the callback returns. + */ +typedef void (*qman_cb_mr)(struct qman_portal *qm, struct qman_fq *fq, + const union qm_mr_entry *msg); + +/* + * s/w-visible states. Ie. tentatively scheduled + truly scheduled + active + + * held-active + held-suspended are just "sched". Things like "retired" will not + * be assumed until it is complete (ie. QMAN_FQ_STATE_CHANGING is set until + * then, to indicate it's completing and to gate attempts to retry the retire + * command). Note, park commands do not set QMAN_FQ_STATE_CHANGING because it's + * technically impossible in the case of enqueue DCAs (which refer to DQRR ring + * index rather than the FQ that ring entry corresponds to), so repeated park + * commands are allowed (if you're silly enough to try) but won't change FQ + * state, and the resulting park notifications move FQs from "sched" to + * "parked". + */ +enum qman_fq_state { + qman_fq_state_oos, + qman_fq_state_parked, + qman_fq_state_sched, + qman_fq_state_retired +}; + +#define QMAN_FQ_STATE_CHANGING 0x80000000 /* 'state' is changing */ +#define QMAN_FQ_STATE_NE 0x40000000 /* retired FQ isn't empty */ +#define QMAN_FQ_STATE_ORL 0x20000000 /* retired FQ has ORL */ +#define QMAN_FQ_STATE_BLOCKOOS 0xe0000000 /* if any are set, no OOS */ +#define QMAN_FQ_STATE_CGR_EN 0x10000000 /* CGR enabled */ +#define QMAN_FQ_STATE_VDQCR 0x08000000 /* being volatile dequeued */ + +/* + * Frame queue objects (struct qman_fq) are stored within memory passed to + * qman_create_fq(), as this allows stashing of caller-provided demux callback + * pointers at no extra cost to stashing of (driver-internal) FQ state. If the + * caller wishes to add per-FQ state and have it benefit from dequeue-stashing, + * they should; + * + * (a) extend the qman_fq structure with their state; eg. + * + * // myfq is allocated and driver_fq callbacks filled in; + * struct my_fq { + * struct qman_fq base; + * int an_extra_field; + * [ ... add other fields to be associated with each FQ ...] + * } *myfq = some_my_fq_allocator(); + * struct qman_fq *fq = qman_create_fq(fqid, flags, &myfq->base); + * + * // in a dequeue callback, access extra fields from 'fq' via a cast; + * struct my_fq *myfq = (struct my_fq *)fq; + * do_something_with(myfq->an_extra_field); + * [...] 
+ * + * (b) when and if configuring the FQ for context stashing, specify how ever + * many cachelines are required to stash 'struct my_fq', to accelerate not + * only the QMan driver but the callback as well. + */ + +struct qman_fq_cb { + qman_cb_dqrr dqrr; /* for dequeued frames */ + qman_cb_mr ern; /* for s/w ERNs */ + qman_cb_mr fqs; /* frame-queue state changes*/ +}; + +struct qman_fq { + /* Caller of qman_create_fq() provides these demux callbacks */ + struct qman_fq_cb cb; + /* + * These are internal to the driver, don't touch. In particular, they + * may change, be removed, or extended (so you shouldn't rely on + * sizeof(qman_fq) being a constant). + */ + u32 fqid, idx; + unsigned long flags; + enum qman_fq_state state; + int cgr_groupid; +}; + +/* + * This callback type is used when handling congestion group entry/exit. + * 'congested' is non-zero on congestion-entry, and zero on congestion-exit. + */ +typedef void (*qman_cb_cgr)(struct qman_portal *qm, + struct qman_cgr *cgr, int congested); + +struct qman_cgr { + /* Set these prior to qman_create_cgr() */ + u32 cgrid; /* 0..255, but u32 to allow specials like -1, 256, etc.*/ + qman_cb_cgr cb; + /* These are private to the driver */ + u16 chan; /* portal channel this object is created on */ + struct list_head node; +}; + +/* Flags to qman_create_fq() */ +#define QMAN_FQ_FLAG_NO_ENQUEUE 0x00000001 /* can't enqueue */ +#define QMAN_FQ_FLAG_NO_MODIFY 0x00000002 /* can only enqueue */ +#define QMAN_FQ_FLAG_TO_DCPORTAL 0x00000004 /* consumed by CAAM/PME/Fman */ +#define QMAN_FQ_FLAG_DYNAMIC_FQID 0x00000020 /* (de)allocate fqid */ + +/* Flags to qman_init_fq() */ +#define QMAN_INITFQ_FLAG_SCHED 0x00000001 /* schedule rather than park */ +#define QMAN_INITFQ_FLAG_LOCAL 0x00000004 /* set dest portal */ + + /* Portal Management */ +/** + * qman_p_irqsource_add - add processing sources to be interrupt-driven + * @bits: bitmask of QM_PIRQ_**I processing sources + * + * Adds processing sources that should be interrupt-driven (rather than + * processed via qman_poll_***() functions). + */ +void qman_p_irqsource_add(struct qman_portal *p, u32 bits); + +/** + * qman_p_irqsource_remove - remove processing sources from being int-driven + * @bits: bitmask of QM_PIRQ_**I processing sources + * + * Removes processing sources from being interrupt-driven, so that they will + * instead be processed via qman_poll_***() functions. + */ +void qman_p_irqsource_remove(struct qman_portal *p, u32 bits); + +/** + * qman_affine_cpus - return a mask of cpus that have affine portals + */ +const cpumask_t *qman_affine_cpus(void); + +/** + * qman_affine_channel - return the channel ID of an portal + * @cpu: the cpu whose affine portal is the subject of the query + * + * If @cpu is -1, the affine portal for the current CPU will be used. It is a + * bug to call this function for any value of @cpu (other than -1) that is not a + * member of the mask returned from qman_affine_cpus(). + */ +u16 qman_affine_channel(int cpu); + +/** + * qman_get_affine_portal - return the portal pointer affine to cpu + * @cpu: the cpu whose affine portal is the subject of the query + */ +struct qman_portal *qman_get_affine_portal(int cpu); + +/** + * qman_p_poll_dqrr - process DQRR (fast-path) entries + * @limit: the maximum number of DQRR entries to process + * + * Use of this function requires that DQRR processing not be interrupt-driven. + * The return value represents the number of DQRR entries processed. 
+ */ +int qman_p_poll_dqrr(struct qman_portal *p, unsigned int limit); + +/** + * qman_p_static_dequeue_add - Add pool channels to the portal SDQCR + * @pools: bit-mask of pool channels, using QM_SDQCR_CHANNELS_POOL(n) + * + * Adds a set of pool channels to the portal's static dequeue command register + * (SDQCR). The requested pools are limited to those the portal has dequeue + * access to. + */ +void qman_p_static_dequeue_add(struct qman_portal *p, u32 pools); + + /* FQ management */ +/** + * qman_create_fq - Allocates a FQ + * @fqid: the index of the FQD to encapsulate, must be "Out of Service" + * @flags: bit-mask of QMAN_FQ_FLAG_*** options + * @fq: memory for storing the 'fq', with callbacks filled in + * + * Creates a frame queue object for the given @fqid, unless the + * QMAN_FQ_FLAG_DYNAMIC_FQID flag is set in @flags, in which case a FQID is + * dynamically allocated (or the function fails if none are available). Once + * created, the caller should not touch the memory at 'fq' except as extended to + * adjacent memory for user-defined fields (see the definition of "struct + * qman_fq" for more info). NO_MODIFY is only intended for enqueuing to + * pre-existing frame-queues that aren't to be otherwise interfered with, it + * prevents all other modifications to the frame queue. The TO_DCPORTAL flag + * causes the driver to honour any contextB modifications requested in the + * qm_init_fq() API, as this indicates the frame queue will be consumed by a + * direct-connect portal (PME, CAAM, or Fman). When frame queues are consumed by + * software portals, the contextB field is controlled by the driver and can't be + * modified by the caller. + */ +int qman_create_fq(u32 fqid, u32 flags, struct qman_fq *fq); + +/** + * qman_destroy_fq - Deallocates a FQ + * @fq: the frame queue object to release + * + * The memory for this frame queue object ('fq' provided in qman_create_fq()) is + * not deallocated but the caller regains ownership, to do with as desired. The + * FQ must be in the 'out-of-service' or in the 'parked' state. + */ +void qman_destroy_fq(struct qman_fq *fq); + +/** + * qman_fq_fqid - Queries the frame queue ID of a FQ object + * @fq: the frame queue object to query + */ +u32 qman_fq_fqid(struct qman_fq *fq); + +/** + * qman_init_fq - Initialises FQ fields, leaves the FQ "parked" or "scheduled" + * @fq: the frame queue object to modify, must be 'parked' or new. + * @flags: bit-mask of QMAN_INITFQ_FLAG_*** options + * @opts: the FQ-modification settings, as defined in the low-level API + * + * The @opts parameter comes from the low-level portal API. Select + * QMAN_INITFQ_FLAG_SCHED in @flags to cause the frame queue to be scheduled + * rather than parked. NB, @opts can be NULL. + * + * Note that some fields and options within @opts may be ignored or overwritten + * by the driver; + * 1. the 'count' and 'fqid' fields are always ignored (this operation only + * affects one frame queue: @fq). + * 2. the QM_INITFQ_WE_CONTEXTB option of the 'we_mask' field and the associated + * 'fqd' structure's 'context_b' field are sometimes overwritten; + * - if @fq was not created with QMAN_FQ_FLAG_TO_DCPORTAL, then context_b is + * initialised to a value used by the driver for demux. + * - if context_b is initialised for demux, so is context_a in case stashing + * is requested (see item 4). + * (So caller control of context_b is only possible for TO_DCPORTAL frame queue + * objects.) + * 3. 
if @flags contains QMAN_INITFQ_FLAG_LOCAL, the 'fqd' structure's + * 'dest::channel' field will be overwritten to match the portal used to issue + * the command. If the WE_DESTWQ write-enable bit had already been set by the + * caller, the channel workqueue will be left as-is, otherwise the write-enable + * bit is set and the workqueue is set to a default of 4. If the "LOCAL" flag + * isn't set, the destination channel/workqueue fields and the write-enable bit + * are left as-is. + * 4. if the driver overwrites context_a/b for demux, then if + * QM_INITFQ_WE_CONTEXTA is set, the driver will only overwrite + * context_a.address fields and will leave the stashing fields provided by the + * user alone, otherwise it will zero out the context_a.stashing fields. + */ +int qman_init_fq(struct qman_fq *fq, u32 flags, struct qm_mcc_initfq *opts); + +/** + * qman_schedule_fq - Schedules a FQ + * @fq: the frame queue object to schedule, must be 'parked' + * + * Schedules the frame queue, which must be Parked, which takes it to + * Tentatively-Scheduled or Truly-Scheduled depending on its fill-level. + */ +int qman_schedule_fq(struct qman_fq *fq); + +/** + * qman_retire_fq - Retires a FQ + * @fq: the frame queue object to retire + * @flags: FQ flags (QMAN_FQ_STATE*) if retirement completes immediately + * + * Retires the frame queue. This returns zero if it succeeds immediately, +1 if + * the retirement was started asynchronously, otherwise it returns negative for + * failure. When this function returns zero, @flags is set to indicate whether + * the retired FQ is empty and/or whether it has any ORL fragments (to show up + * as ERNs). Otherwise the corresponding flags will be known when a subsequent + * FQRN message shows up on the portal's message ring. + * + * NB, if the retirement is asynchronous (the FQ was in the Truly Scheduled or + * Active state), the completion will be via the message ring as a FQRN - but + * the corresponding callback may occur before this function returns!! Ie. the + * caller should be prepared to accept the callback as the function is called, + * not only once it has returned. + */ +int qman_retire_fq(struct qman_fq *fq, u32 *flags); + +/** + * qman_oos_fq - Puts a FQ "out of service" + * @fq: the frame queue object to be put out-of-service, must be 'retired' + * + * The frame queue must be retired and empty, and if any order restoration list + * was released as ERNs at the time of retirement, they must all be consumed. + */ +int qman_oos_fq(struct qman_fq *fq); + +/** + * qman_enqueue - Enqueue a frame to a frame queue + * @fq: the frame queue object to enqueue to + * @fd: a descriptor of the frame to be enqueued + * + * Fills an entry in the EQCR of portal @qm to enqueue the frame described by + * @fd. The descriptor details are copied from @fd to the EQCR entry, the 'pid' + * field is ignored. The return value is non-zero on error, such as ring full. + */ +int qman_enqueue(struct qman_fq *fq, const struct qm_fd *fd); + +/** + * qman_alloc_fqid_range - Allocate a contiguous range of FQIDs + * @result: is set by the API to the base FQID of the allocated range + * @count: the number of FQIDs required + * + * Returns 0 on success, or a negative error code. 
+ */ +int qman_alloc_fqid_range(u32 *result, u32 count); +#define qman_alloc_fqid(result) qman_alloc_fqid_range(result, 1) + +/** + * qman_release_fqid - Release the specified frame queue ID + * @fqid: the FQID to be released back to the resource pool + * + * This function can also be used to seed the allocator with + * FQID ranges that it can subsequently allocate from. + * Returns 0 on success, or a negative error code. + */ +int qman_release_fqid(u32 fqid); + + /* Pool-channel management */ +/** + * qman_alloc_pool_range - Allocate a contiguous range of pool-channel IDs + * @result: is set by the API to the base pool-channel ID of the allocated range + * @count: the number of pool-channel IDs required + * + * Returns 0 on success, or a negative error code. + */ +int qman_alloc_pool_range(u32 *result, u32 count); +#define qman_alloc_pool(result) qman_alloc_pool_range(result, 1) + +/** + * qman_release_pool - Release the specified pool-channel ID + * @id: the pool-chan ID to be released back to the resource pool + * + * This function can also be used to seed the allocator with + * pool-channel ID ranges that it can subsequently allocate from. + * Returns 0 on success, or a negative error code. + */ +int qman_release_pool(u32 id); + + /* CGR management */ +/** + * qman_create_cgr - Register a congestion group object + * @cgr: the 'cgr' object, with fields filled in + * @flags: QMAN_CGR_FLAG_* values + * @opts: optional state of CGR settings + * + * Registers this object to receiving congestion entry/exit callbacks on the + * portal affine to the cpu portal on which this API is executed. If opts is + * NULL then only the callback (cgr->cb) function is registered. If @flags + * contains QMAN_CGR_FLAG_USE_INIT, then an init hw command (which will reset + * any unspecified parameters) will be used rather than a modify hw hardware + * (which only modifies the specified parameters). + */ +int qman_create_cgr(struct qman_cgr *cgr, u32 flags, + struct qm_mcc_initcgr *opts); + +/** + * qman_delete_cgr - Deregisters a congestion group object + * @cgr: the 'cgr' object to deregister + * + * "Unplugs" this CGR object from the portal affine to the cpu on which this API + * is executed. This must be excuted on the same affine portal on which it was + * created. + */ +int qman_delete_cgr(struct qman_cgr *cgr); + +/** + * qman_delete_cgr_safe - Deregisters a congestion group object from any CPU + * @cgr: the 'cgr' object to deregister + * + * This will select the proper CPU and run there qman_delete_cgr(). + */ +void qman_delete_cgr_safe(struct qman_cgr *cgr); + +/** + * qman_query_cgr_congested - Queries CGR's congestion status + * @cgr: the 'cgr' object to query + * @result: returns 'cgr's congestion status, 1 (true) if congested + */ +int qman_query_cgr_congested(struct qman_cgr *cgr, bool *result); + +/** + * qman_alloc_cgrid_range - Allocate a contiguous range of CGR IDs + * @result: is set by the API to the base CGR ID of the allocated range + * @count: the number of CGR IDs required + * + * Returns 0 on success, or a negative error code. + */ +int qman_alloc_cgrid_range(u32 *result, u32 count); +#define qman_alloc_cgrid(result) qman_alloc_cgrid_range(result, 1) + +/** + * qman_release_cgrid - Release the specified CGR ID + * @id: the CGR ID to be released back to the resource pool + * + * This function can also be used to seed the allocator with + * CGR ID ranges that it can subsequently allocate from. + * Returns 0 on success, or a negative error code. 
+ */ +int qman_release_cgrid(u32 id); + +#endif /* __FSL_QMAN_H */ -- cgit v1.2.3 From 54f9c4d0778b3f9ab791b1b7eb1a5d2809f02f50 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 20 Sep 2016 09:01:38 +0200 Subject: ipmi: add an Aspeed BT IPMI BMC driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a simple device driver to expose the iBT interface on Aspeed SOCs (AST2400 and AST2500) as a character device. Such SOCs are commonly used as BMCs (BaseBoard Management Controllers) and this driver implements the BMC side of the BT interface. The BT (Block Transfer) interface is used to perform in-band IPMI communication between a host and its BMC. Entire messages are buffered before sending a notification to the other end, host or BMC, that there is data to be read. Usually, the host emits requests and the BMC responses but the specification provides a mean for the BMC to send SMS Attention (BMC-to-Host attention or System Management Software attention) messages. For this purpose, the driver introduces a specific ioctl on the device: 'BT_BMC_IOCTL_SMS_ATN' that can be used by the system running on the BMC to signal the host of such an event. The device name defaults to '/dev/ipmi-bt-host' Signed-off-by: Alistair Popple Signed-off-by: Jeremy Kerr Signed-off-by: Joel Stanley [clg: - checkpatch fixes - added a devicetree binding documentation - replace 'bt_host' by 'bt_bmc' to reflect that the driver is the BMC side of the IPMI BT interface - renamed the device to 'ipmi-bt-host' - introduced a temporary buffer to copy_{to,from}_user - used platform_get_irq() - moved the driver under drivers/char/ipmi/ but kept it as a misc device - changed the compatible cell to "aspeed,ast2400-bt-bmc" ] Signed-off-by: Cédric Le Goater Acked-by: Arnd Bergmann [clg: - checkpatch --strict fixes - removed the use of devm_iounmap, devm_kfree in cleanup paths - introduced an atomic-t to limit opens to 1 - introduced a mutex to protect write/read operations] Acked-by: Rob Herring Signed-off-by: Cédric Le Goater Signed-off-by: Corey Minyard --- Documentation/devicetree/bindings/ipmi.txt | 25 - .../bindings/ipmi/aspeed,ast2400-bt-bmc.txt | 23 + .../devicetree/bindings/ipmi/ipmi-smic.txt | 25 + drivers/Makefile | 2 +- drivers/char/ipmi/Kconfig | 7 + drivers/char/ipmi/Makefile | 1 + drivers/char/ipmi/bt-bmc.c | 510 +++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/bt-bmc.h | 18 + 9 files changed, 586 insertions(+), 26 deletions(-) delete mode 100644 Documentation/devicetree/bindings/ipmi.txt create mode 100644 Documentation/devicetree/bindings/ipmi/aspeed,ast2400-bt-bmc.txt create mode 100644 Documentation/devicetree/bindings/ipmi/ipmi-smic.txt create mode 100644 drivers/char/ipmi/bt-bmc.c create mode 100644 include/uapi/linux/bt-bmc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/ipmi.txt b/Documentation/devicetree/bindings/ipmi.txt deleted file mode 100644 index d5f1a877ed3e..000000000000 --- a/Documentation/devicetree/bindings/ipmi.txt +++ /dev/null @@ -1,25 +0,0 @@ -IPMI device - -Required properties: -- compatible: should be one of ipmi-kcs, ipmi-smic, or ipmi-bt -- device_type: should be ipmi -- reg: Address and length of the register set for the device - -Optional properties: -- interrupts: The interrupt for the device. Without this the interface - is polled. -- reg-size - The size of the register. Defaults to 1 -- reg-spacing - The number of bytes between register starts. 
Defaults to 1 -- reg-shift - The amount to shift the registers to the right to get the data - into bit zero. - -Example: - -smic@fff3a000 { - compatible = "ipmi-smic"; - device_type = "ipmi"; - reg = <0xfff3a000 0x1000>; - interrupts = <0 24 4>; - reg-size = <4>; - reg-spacing = <4>; -}; diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-bt-bmc.txt b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-bt-bmc.txt new file mode 100644 index 000000000000..fbbacd958240 --- /dev/null +++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-bt-bmc.txt @@ -0,0 +1,23 @@ +* Aspeed BT (Block Transfer) IPMI interface + +The Aspeed SOCs (AST2400 and AST2500) are commonly used as BMCs +(BaseBoard Management Controllers) and the BT interface can be used to +perform in-band IPMI communication with their host. + +Required properties: + +- compatible : should be "aspeed,ast2400-bt-bmc" +- reg: physical address and size of the registers + +Optional properties: + +- interrupts: interrupt generated by the BT interface. without an + interrupt, the driver will operate in poll mode. + +Example: + + ibt@1e789140 { + compatible = "aspeed,ast2400-bt-bmc"; + reg = <0x1e789140 0x18>; + interrupts = <8>; + }; diff --git a/Documentation/devicetree/bindings/ipmi/ipmi-smic.txt b/Documentation/devicetree/bindings/ipmi/ipmi-smic.txt new file mode 100644 index 000000000000..d5f1a877ed3e --- /dev/null +++ b/Documentation/devicetree/bindings/ipmi/ipmi-smic.txt @@ -0,0 +1,25 @@ +IPMI device + +Required properties: +- compatible: should be one of ipmi-kcs, ipmi-smic, or ipmi-bt +- device_type: should be ipmi +- reg: Address and length of the register set for the device + +Optional properties: +- interrupts: The interrupt for the device. Without this the interface + is polled. +- reg-size - The size of the register. Defaults to 1 +- reg-spacing - The number of bytes between register starts. Defaults to 1 +- reg-shift - The amount to shift the registers to the right to get the data + into bit zero. + +Example: + +smic@fff3a000 { + compatible = "ipmi-smic"; + device_type = "ipmi"; + reg = <0xfff3a000 0x1000>; + interrupts = <0 24 4>; + reg-size = <4>; + reg-spacing = <4>; +}; diff --git a/drivers/Makefile b/drivers/Makefile index 53abb4a5f736..5a9e7b6b7928 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -21,7 +21,7 @@ obj-y += video/ obj-y += idle/ # IPMI must come before ACPI in order to provide IPMI opregion support -obj-$(CONFIG_IPMI_HANDLER) += char/ipmi/ +obj-y += char/ipmi/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_SFI) += sfi/ diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index 5a9350b1069a..2c234e3e7513 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -76,3 +76,10 @@ config IPMI_POWEROFF the IPMI management controller is capable of this. endif # IPMI_HANDLER + +config ASPEED_BT_IPMI_BMC + tristate "BT IPMI bmc driver" + help + Provides a driver for the BT (Block Transfer) IPMI interface + found on Aspeed SOCs (AST2400 and AST2500). The driver + implements the BMC side of the BT interface. 
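[Editor's note, not part of the patch.] The commit message above describes the new character device only in prose, so the following sketch shows how BMC-side software might drive it. It assumes only what the patch itself provides: the /dev/ipmi-bt-host miscdevice, read()/write() of whole BT messages, the single-opener restriction, and the BT_BMC_IOCTL_SMS_ATN ioctl exported through include/uapi/linux/bt-bmc.h; the request decoding is left as a placeholder.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/bt-bmc.h>	/* BT_BMC_IOCTL_SMS_ATN, added by this patch */

int main(void)
{
	unsigned char req[256], rsp[256];
	ssize_t len;
	int fd;

	fd = open("/dev/ipmi-bt-host", O_RDWR);	/* driver allows a single opener */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Wait for the host to post a request; the driver buffers it whole. */
	len = read(fd, req, sizeof(req));
	if (len > 0) {
		/* ... decode the IPMI request and build the matching response ... */
		memcpy(rsp, req, len);		/* placeholder: echo the message back */
		if (write(fd, rsp, len) < 0)
			perror("write");
	}

	/* Raise an SMS Attention to the host's system management software. */
	if (ioctl(fd, BT_BMC_IOCTL_SMS_ATN) < 0)
		perror("ioctl(BT_BMC_IOCTL_SMS_ATN)");

	close(fd);
	return 0;
}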
diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile index f3ffde1f5f1f..0d98cd91def1 100644 --- a/drivers/char/ipmi/Makefile +++ b/drivers/char/ipmi/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_IPMI_SSIF) += ipmi_ssif.o obj-$(CONFIG_IPMI_POWERNV) += ipmi_powernv.o obj-$(CONFIG_IPMI_WATCHDOG) += ipmi_watchdog.o obj-$(CONFIG_IPMI_POWEROFF) += ipmi_poweroff.o +obj-$(CONFIG_ASPEED_BT_IPMI_BMC) += bt-bmc.o diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c new file mode 100644 index 000000000000..2e880bf0be26 --- /dev/null +++ b/drivers/char/ipmi/bt-bmc.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2015-2016, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is a BMC device used to communicate to the host + */ +#define DEVICE_NAME "ipmi-bt-host" + +#define BT_IO_BASE 0xe4 +#define BT_IRQ 10 + +#define BT_CR0 0x0 +#define BT_CR0_IO_BASE 16 +#define BT_CR0_IRQ 12 +#define BT_CR0_EN_CLR_SLV_RDP 0x8 +#define BT_CR0_EN_CLR_SLV_WRP 0x4 +#define BT_CR0_ENABLE_IBT 0x1 +#define BT_CR1 0x4 +#define BT_CR1_IRQ_H2B 0x01 +#define BT_CR1_IRQ_HBUSY 0x40 +#define BT_CR2 0x8 +#define BT_CR2_IRQ_H2B 0x01 +#define BT_CR2_IRQ_HBUSY 0x40 +#define BT_CR3 0xc +#define BT_CTRL 0x10 +#define BT_CTRL_B_BUSY 0x80 +#define BT_CTRL_H_BUSY 0x40 +#define BT_CTRL_OEM0 0x20 +#define BT_CTRL_SMS_ATN 0x10 +#define BT_CTRL_B2H_ATN 0x08 +#define BT_CTRL_H2B_ATN 0x04 +#define BT_CTRL_CLR_RD_PTR 0x02 +#define BT_CTRL_CLR_WR_PTR 0x01 +#define BT_BMC2HOST 0x14 +#define BT_INTMASK 0x18 +#define BT_INTMASK_B2H_IRQEN 0x01 +#define BT_INTMASK_B2H_IRQ 0x02 +#define BT_INTMASK_BMC_HWRST 0x80 + +#define BT_BMC_BUFFER_SIZE 256 + +struct bt_bmc { + struct device dev; + struct miscdevice miscdev; + void __iomem *base; + int irq; + wait_queue_head_t queue; + struct timer_list poll_timer; + struct mutex mutex; +}; + +static atomic_t open_count = ATOMIC_INIT(0); + +static u8 bt_inb(struct bt_bmc *bt_bmc, int reg) +{ + return ioread8(bt_bmc->base + reg); +} + +static void bt_outb(struct bt_bmc *bt_bmc, u8 data, int reg) +{ + iowrite8(data, bt_bmc->base + reg); +} + +static void clr_rd_ptr(struct bt_bmc *bt_bmc) +{ + bt_outb(bt_bmc, BT_CTRL_CLR_RD_PTR, BT_CTRL); +} + +static void clr_wr_ptr(struct bt_bmc *bt_bmc) +{ + bt_outb(bt_bmc, BT_CTRL_CLR_WR_PTR, BT_CTRL); +} + +static void clr_h2b_atn(struct bt_bmc *bt_bmc) +{ + bt_outb(bt_bmc, BT_CTRL_H2B_ATN, BT_CTRL); +} + +static void set_b_busy(struct bt_bmc *bt_bmc) +{ + if (!(bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_B_BUSY)) + bt_outb(bt_bmc, BT_CTRL_B_BUSY, BT_CTRL); +} + +static void clr_b_busy(struct bt_bmc *bt_bmc) +{ + if (bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_B_BUSY) + bt_outb(bt_bmc, BT_CTRL_B_BUSY, BT_CTRL); +} + +static void set_b2h_atn(struct bt_bmc *bt_bmc) +{ + bt_outb(bt_bmc, BT_CTRL_B2H_ATN, BT_CTRL); +} + +static u8 bt_read(struct bt_bmc *bt_bmc) +{ + return bt_inb(bt_bmc, BT_BMC2HOST); +} + +static ssize_t bt_readn(struct bt_bmc *bt_bmc, u8 *buf, size_t n) +{ + int i; + + for (i = 0; i < n; i++) + buf[i] = bt_read(bt_bmc); + return n; +} + +static void bt_write(struct bt_bmc *bt_bmc, u8 c) +{ + bt_outb(bt_bmc, c, BT_BMC2HOST); +} + +static ssize_t bt_writen(struct bt_bmc *bt_bmc, u8 *buf, size_t n) +{ + int i; + + for 
(i = 0; i < n; i++) + bt_write(bt_bmc, buf[i]); + return n; +} + +static void set_sms_atn(struct bt_bmc *bt_bmc) +{ + bt_outb(bt_bmc, BT_CTRL_SMS_ATN, BT_CTRL); +} + +static struct bt_bmc *file_bt_bmc(struct file *file) +{ + return container_of(file->private_data, struct bt_bmc, miscdev); +} + +static int bt_bmc_open(struct inode *inode, struct file *file) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + + if (atomic_inc_return(&open_count) == 1) { + clr_b_busy(bt_bmc); + return 0; + } + + atomic_dec(&open_count); + return -EBUSY; +} + +/* + * The BT (Block Transfer) interface means that entire messages are + * buffered by the host before a notification is sent to the BMC that + * there is data to be read. The first byte is the length and the + * message data follows. The read operation just tries to capture the + * whole before returning it to userspace. + * + * BT Message format : + * + * Byte 1 Byte 2 Byte 3 Byte 4 Byte 5:N + * Length NetFn/LUN Seq Cmd Data + * + */ +static ssize_t bt_bmc_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + u8 len; + int len_byte = 1; + u8 kbuffer[BT_BMC_BUFFER_SIZE]; + ssize_t ret = 0; + ssize_t nread; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + WARN_ON(*ppos); + + if (wait_event_interruptible(bt_bmc->queue, + bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_H2B_ATN)) + return -ERESTARTSYS; + + mutex_lock(&bt_bmc->mutex); + + if (unlikely(!(bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_H2B_ATN))) { + ret = -EIO; + goto out_unlock; + } + + set_b_busy(bt_bmc); + clr_h2b_atn(bt_bmc); + clr_rd_ptr(bt_bmc); + + /* + * The BT frames start with the message length, which does not + * include the length byte. + */ + kbuffer[0] = bt_read(bt_bmc); + len = kbuffer[0]; + + /* We pass the length back to userspace as well */ + if (len + 1 > count) + len = count - 1; + + while (len) { + nread = min_t(ssize_t, len, sizeof(kbuffer) - len_byte); + + bt_readn(bt_bmc, kbuffer + len_byte, nread); + + if (copy_to_user(buf, kbuffer, nread + len_byte)) { + ret = -EFAULT; + break; + } + len -= nread; + buf += nread + len_byte; + ret += nread + len_byte; + len_byte = 0; + } + + clr_b_busy(bt_bmc); + +out_unlock: + mutex_unlock(&bt_bmc->mutex); + return ret; +} + +/* + * BT Message response format : + * + * Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6:N + * Length NetFn/LUN Seq Cmd Code Data + */ +static ssize_t bt_bmc_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + u8 kbuffer[BT_BMC_BUFFER_SIZE]; + ssize_t ret = 0; + ssize_t nwritten; + + /* + * send a minimum response size + */ + if (count < 5) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + WARN_ON(*ppos); + + /* + * There's no interrupt for clearing bmc busy so we have to + * poll + */ + if (wait_event_interruptible(bt_bmc->queue, + !(bt_inb(bt_bmc, BT_CTRL) & + (BT_CTRL_H_BUSY | BT_CTRL_B2H_ATN)))) + return -ERESTARTSYS; + + mutex_lock(&bt_bmc->mutex); + + if (unlikely(bt_inb(bt_bmc, BT_CTRL) & + (BT_CTRL_H_BUSY | BT_CTRL_B2H_ATN))) { + ret = -EIO; + goto out_unlock; + } + + clr_wr_ptr(bt_bmc); + + while (count) { + nwritten = min_t(ssize_t, count, sizeof(kbuffer)); + if (copy_from_user(&kbuffer, buf, nwritten)) { + ret = -EFAULT; + break; + } + + bt_writen(bt_bmc, kbuffer, nwritten); + + count -= nwritten; + buf += nwritten; + ret += nwritten; + } + + set_b2h_atn(bt_bmc); + +out_unlock: + mutex_unlock(&bt_bmc->mutex); + return ret; +} + +static 
long bt_bmc_ioctl(struct file *file, unsigned int cmd, + unsigned long param) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + + switch (cmd) { + case BT_BMC_IOCTL_SMS_ATN: + set_sms_atn(bt_bmc); + return 0; + } + return -EINVAL; +} + +static int bt_bmc_release(struct inode *inode, struct file *file) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + + atomic_dec(&open_count); + set_b_busy(bt_bmc); + return 0; +} + +static unsigned int bt_bmc_poll(struct file *file, poll_table *wait) +{ + struct bt_bmc *bt_bmc = file_bt_bmc(file); + unsigned int mask = 0; + u8 ctrl; + + poll_wait(file, &bt_bmc->queue, wait); + + ctrl = bt_inb(bt_bmc, BT_CTRL); + + if (ctrl & BT_CTRL_H2B_ATN) + mask |= POLLIN; + + if (!(ctrl & (BT_CTRL_H_BUSY | BT_CTRL_B2H_ATN))) + mask |= POLLOUT; + + return mask; +} + +static const struct file_operations bt_bmc_fops = { + .owner = THIS_MODULE, + .open = bt_bmc_open, + .read = bt_bmc_read, + .write = bt_bmc_write, + .release = bt_bmc_release, + .poll = bt_bmc_poll, + .unlocked_ioctl = bt_bmc_ioctl, +}; + +static void poll_timer(unsigned long data) +{ + struct bt_bmc *bt_bmc = (void *)data; + + bt_bmc->poll_timer.expires += msecs_to_jiffies(500); + wake_up(&bt_bmc->queue); + add_timer(&bt_bmc->poll_timer); +} + +static irqreturn_t bt_bmc_irq(int irq, void *arg) +{ + struct bt_bmc *bt_bmc = arg; + u32 reg; + + reg = ioread32(bt_bmc->base + BT_CR2); + reg &= BT_CR2_IRQ_H2B | BT_CR2_IRQ_HBUSY; + if (!reg) + return IRQ_NONE; + + /* ack pending IRQs */ + iowrite32(reg, bt_bmc->base + BT_CR2); + + wake_up(&bt_bmc->queue); + return IRQ_HANDLED; +} + +static int bt_bmc_config_irq(struct bt_bmc *bt_bmc, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + u32 reg; + int rc; + + bt_bmc->irq = platform_get_irq(pdev, 0); + if (!bt_bmc->irq) + return -ENODEV; + + rc = devm_request_irq(dev, bt_bmc->irq, bt_bmc_irq, IRQF_SHARED, + DEVICE_NAME, bt_bmc); + if (rc < 0) { + dev_warn(dev, "Unable to request IRQ %d\n", bt_bmc->irq); + bt_bmc->irq = 0; + return rc; + } + + /* + * Configure IRQs on the bmc clearing the H2B and HBUSY bits; + * H2B will be asserted when the bmc has data for us; HBUSY + * will be cleared (along with B2H) when we can write the next + * message to the BT buffer + */ + reg = ioread32(bt_bmc->base + BT_CR1); + reg |= BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY; + iowrite32(reg, bt_bmc->base + BT_CR1); + + return 0; +} + +static int bt_bmc_probe(struct platform_device *pdev) +{ + struct bt_bmc *bt_bmc; + struct device *dev; + struct resource *res; + int rc; + + if (!pdev || !pdev->dev.of_node) + return -ENODEV; + + dev = &pdev->dev; + dev_info(dev, "Found bt bmc device\n"); + + bt_bmc = devm_kzalloc(dev, sizeof(*bt_bmc), GFP_KERNEL); + if (!bt_bmc) + return -ENOMEM; + + dev_set_drvdata(&pdev->dev, bt_bmc); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "Unable to find resources\n"); + return -ENXIO; + } + + bt_bmc->base = devm_ioremap_resource(&pdev->dev, res); + if (!bt_bmc->base) + return -ENOMEM; + + mutex_init(&bt_bmc->mutex); + init_waitqueue_head(&bt_bmc->queue); + + bt_bmc->miscdev.minor = MISC_DYNAMIC_MINOR, + bt_bmc->miscdev.name = DEVICE_NAME, + bt_bmc->miscdev.fops = &bt_bmc_fops, + bt_bmc->miscdev.parent = dev; + rc = misc_register(&bt_bmc->miscdev); + if (rc) { + dev_err(dev, "Unable to register misc device\n"); + return rc; + } + + bt_bmc_config_irq(bt_bmc, pdev); + + if (bt_bmc->irq) { + dev_info(dev, "Using IRQ %d\n", bt_bmc->irq); + } else { + dev_info(dev, "No IRQ; using timer\n"); + 
setup_timer(&bt_bmc->poll_timer, poll_timer, + (unsigned long)bt_bmc); + bt_bmc->poll_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&bt_bmc->poll_timer); + } + + iowrite32((BT_IO_BASE << BT_CR0_IO_BASE) | + (BT_IRQ << BT_CR0_IRQ) | + BT_CR0_EN_CLR_SLV_RDP | + BT_CR0_EN_CLR_SLV_WRP | + BT_CR0_ENABLE_IBT, + bt_bmc->base + BT_CR0); + + clr_b_busy(bt_bmc); + + return 0; +} + +static int bt_bmc_remove(struct platform_device *pdev) +{ + struct bt_bmc *bt_bmc = dev_get_drvdata(&pdev->dev); + + misc_deregister(&bt_bmc->miscdev); + if (!bt_bmc->irq) + del_timer_sync(&bt_bmc->poll_timer); + return 0; +} + +static const struct of_device_id bt_bmc_match[] = { + { .compatible = "aspeed,ast2400-bt-bmc" }, + { }, +}; + +static struct platform_driver bt_bmc_driver = { + .driver = { + .name = DEVICE_NAME, + .of_match_table = bt_bmc_match, + }, + .probe = bt_bmc_probe, + .remove = bt_bmc_remove, +}; + +module_platform_driver(bt_bmc_driver); + +MODULE_DEVICE_TABLE(of, bt_bmc_match); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alistair Popple "); +MODULE_DESCRIPTION("Linux device interface to the BT interface"); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 185f8ea2702f..17b12942c67d 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -74,6 +74,7 @@ header-y += bpf_common.h header-y += bpf.h header-y += bpqether.h header-y += bsg.h +header-y += bt-bmc.h header-y += btrfs.h header-y += can.h header-y += capability.h diff --git a/include/uapi/linux/bt-bmc.h b/include/uapi/linux/bt-bmc.h new file mode 100644 index 000000000000..d9ec766a63d0 --- /dev/null +++ b/include/uapi/linux/bt-bmc.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2015-2016, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_BT_BMC_H +#define _UAPI_LINUX_BT_BMC_H + +#include + +#define __BT_BMC_IOCTL_MAGIC 0xb1 +#define BT_BMC_IOCTL_SMS_ATN _IO(__BT_BMC_IOCTL_MAGIC, 0x00) + +#endif /* _UAPI_LINUX_BT_BMC_H */ -- cgit v1.2.3 From 6675df311db87aa2107a04ef97e19420953cbace Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 22 Sep 2016 17:24:22 -0700 Subject: Btrfs: catch invalid free space trees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two separate issues that can lead to corrupted free space trees. 1. The free space tree bitmaps had an endianness issue on big-endian systems which is fixed by an earlier patch in this series. 2. btrfs-progs before v4.7.3 modified filesystems without updating the free space tree. To catch both of these issues at once, we need to force the free space tree to be rebuilt. To do so, add a FREE_SPACE_TREE_VALID compat_ro bit. If the bit isn't set, we know that it was either produced by a broken big-endian kernel or may have been corrupted by btrfs-progs. This also provides us with a way to add rudimentary read-write support for the free space tree to btrfs-progs: it can just clear this bit and have the kernel rebuild the free space tree. 
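The mount-time decision introduced by the disk-io.c hunk below amounts to the following sketch (names exactly as in the patch; only shown here to summarize the logic):

	/* Rebuild when the user asked for it or when the VALID bit is missing. */
	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		clear_free_space_tree = 1;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		clear_free_space_tree = 1;
	}

	if (clear_free_space_tree)
		ret = btrfs_clear_free_space_tree(fs_info);
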
Cc: stable@vger.kernel.org # 4.5+ Tested-by: Holger Hoffstätte Tested-by: Chandan Rajendra Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/disk-io.c | 9 +++++++++ fs/btrfs/free-space-tree.c | 2 ++ include/uapi/linux/btrfs.h | 12 +++++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 33fe03551105..791e47ce9d27 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -251,7 +251,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c0bfc6ce5f06..3dede6d53bad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2566,6 +2566,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -3131,6 +3132,14 @@ retry_root_backup: if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = 1; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = 1; + } + + if (clear_free_space_tree) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 8fd85bfbe2da..ea605ffd0e03 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1182,6 +1182,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->creating_free_space_tree = 0; ret = btrfs_commit_transaction(trans, tree_root); @@ -1250,6 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index ac5eacd3055b..db4c253f8011 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -239,7 +239,17 @@ struct btrfs_ioctl_fs_info_args { * Used by: * struct btrfs_ioctl_feature_flags */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) +/* + * Older kernels (< 4.9) on big-endian systems produced broken free space tree + * bitmaps, and btrfs-progs also used to corrupt the free space tree (versions + * < 4.7.3). If this bit is clear, then the free space tree cannot be trusted. + * btrfs-progs can also intentionally clear this bit to ask the kernel to + * rebuild the free space tree, however this might not work on older kernels + * that do not know about this bit. If not sure, clear the cache manually on + * first mount when booting older kernel versions. 
+ */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -- cgit v1.2.3 From 38addce8b600ca335dc86fa3d48c890f1c6fa1f4 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:41:19 +0200 Subject: gcc-plugins: Add latent_entropy plugin This adds a new gcc plugin named "latent_entropy". It is designed to extract as much possible uncertainty from a running system at boot time as possible, hoping to capitalize on any possible variation in CPU operation (due to runtime data differences, hardware differences, SMP ordering, thermal timing variation, cache behavior, etc). At the very least, this plugin is a much more comprehensive example for how to manipulate kernel code using the gcc plugin internals. The need for very-early boot entropy tends to be very architecture or system design specific, so this plugin is more suited for those sorts of special cases. The existing kernel RNG already attempts to extract entropy from reliable runtime variation, but this plugin takes the idea to a logical extreme by permuting a global variable based on any variation in code execution (e.g. a different value (and permutation function) is used to permute the global based on loop count, case statement, if/then/else branching, etc). To do this, the plugin starts by inserting a local variable in every marked function. The plugin then adds logic so that the value of this variable is modified by randomly chosen operations (add, xor and rol) and random values (gcc generates separate static values for each location at compile time and also injects the stack pointer at runtime). The resulting value depends on the control flow path (e.g., loops and branches taken). Before the function returns, the plugin mixes this local variable into the latent_entropy global variable. The value of this global variable is added to the kernel entropy pool in do_one_initcall() and _do_fork(), though it does not credit any bytes of entropy to the pool; the contents of the global are just used to mix the pool. Additionally, the plugin can pre-initialize arrays with build-time random contents, so that two different kernel builds running on identical hardware will not have the same starting values. Signed-off-by: Emese Revfy [kees: expanded commit message and code comments] Signed-off-by: Kees Cook --- arch/Kconfig | 18 + arch/powerpc/kernel/Makefile | 5 + include/linux/random.h | 11 + init/main.c | 1 + kernel/fork.c | 1 + mm/page_alloc.c | 5 + scripts/Makefile.gcc-plugins | 9 +- scripts/gcc-plugins/latent_entropy_plugin.c | 640 ++++++++++++++++++++++++++++ 8 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 scripts/gcc-plugins/latent_entropy_plugin.c (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index fd6e9712af81..ae80cd24e8f0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -383,6 +383,24 @@ config GCC_PLUGIN_SANCOV gcc-4.5 on). It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov . +config GCC_PLUGIN_LATENT_ENTROPY + bool "Generate some entropy during boot and runtime" + depends on GCC_PLUGINS + help + By saying Y here the kernel will instrument some kernel code to + extract some entropy from both original and artificially created + program state. This will help especially embedded systems where + there is little 'natural' source of entropy normally. The cost + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. 
+ + Note that entropy extracted this way is not cryptographically + secure! + + This plugin was ported from grsecurity/PaX. More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + config HAVE_CC_STACKPROTECTOR bool help diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fe4c075bcf50..62df36c3f138 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) diff --git a/include/linux/random.h b/include/linux/random.h index 3d6e9815cd85..a59c74cdb1eb 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -18,6 +18,17 @@ struct random_ready_callback { }; extern void add_device_randomness(const void *, unsigned int); + +#if defined(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) && !defined(__CHECKER__) +static inline void add_latent_entropy(void) +{ + add_device_randomness((const void *)&latent_entropy, + sizeof(latent_entropy)); +} +#else +static inline void add_latent_entropy(void) {} +#endif + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq, int irq_flags); diff --git a/init/main.c b/init/main.c index a8a58e2794a5..2858be732f6d 100644 --- a/init/main.c +++ b/init/main.c @@ -789,6 +789,7 @@ int __init_or_module do_one_initcall(initcall_t fn) } WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + add_latent_entropy(); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..001b18473a07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1780,6 +1780,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..248851d1fc86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,6 +91,11 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY +volatile u64 latent_entropy; +EXPORT_SYMBOL(latent_entropy); +#endif + /* * Array of node states. */ diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 61f0e6db909b..060d2cb373db 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -6,6 +6,12 @@ ifdef CONFIG_GCC_PLUGINS gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) += cyc_complexity_plugin.so + gcc-plugin-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += latent_entropy_plugin.so + gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += -DLATENT_ENTROPY_PLUGIN + ifdef CONFIG_PAX_LATENT_ENTROPY + DISABLE_LATENT_ENTROPY_PLUGIN += -fplugin-arg-latent_entropy_plugin-disable + endif + ifdef CONFIG_GCC_PLUGIN_SANCOV ifeq ($(CFLAGS_KCOV),) # It is needed because of the gcc-plugin.sh and gcc version checks. 
@@ -21,7 +27,8 @@ ifdef CONFIG_GCC_PLUGINS GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) - export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR SANCOV_PLUGIN + export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR + export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN ifneq ($(PLUGINCC),) # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication. diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c new file mode 100644 index 000000000000..ff1939b804ae --- /dev/null +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -0,0 +1,640 @@ +/* + * Copyright 2012-2016 by the PaX Team + * Copyright 2016 by Emese Revfy + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin helps generate a little bit of entropy from program state, + * used throughout the uptime of the kernel. Here is an instrumentation example: + * + * before: + * void __latent_entropy test(int argc, char *argv[]) + * { + * if (argc <= 1) + * printf("%s: no command arguments :(\n", *argv); + * else + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * after: + * void __latent_entropy test(int argc, char *argv[]) + * { + * // latent_entropy_execute() 1. + * unsigned long local_entropy; + * // init_local_entropy() 1. + * void *local_entropy_frameaddr; + * // init_local_entropy() 3. + * unsigned long tmp_latent_entropy; + * + * // init_local_entropy() 2. + * local_entropy_frameaddr = __builtin_frame_address(0); + * local_entropy = (unsigned long) local_entropy_frameaddr; + * + * // init_local_entropy() 4. + * tmp_latent_entropy = latent_entropy; + * // init_local_entropy() 5. + * local_entropy ^= tmp_latent_entropy; + * + * // latent_entropy_execute() 3. + * if (argc <= 1) { + * // perturb_local_entropy() + * local_entropy += 4623067384293424948; + * printf("%s: no command arguments :(\n", *argv); + * // perturb_local_entropy() + * } else { + * local_entropy ^= 3896280633962944730; + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * // latent_entropy_execute() 4. + * tmp_latent_entropy = rol(tmp_latent_entropy, local_entropy); + * latent_entropy = tmp_latent_entropy; + * } + * + * TODO: + * - add ipa pass to identify not explicitly marked candidate functions + * - mix in more program state (function arguments/return values, + * loop variables, etc) + * - more instrumentation control via attribute parameters + * + * BUGS: + * - none known + * + * Options: + * -fplugin-arg-latent_entropy_plugin-disable + * + * Attribute: __attribute__((latent_entropy)) + * The latent_entropy gcc attribute can be only on functions and variables. + * If it is on a function then the plugin will instrument it. If the attribute + * is on a variable then the plugin will initialize it with a random value. + * The variable must be an integer, an integer array type or a structure + * with integer fields. 
+ */ + +#include "gcc-common.h" + +int plugin_is_GPL_compatible; + +static GTY(()) tree latent_entropy_decl; + +static struct plugin_info latent_entropy_plugin_info = { + .version = "201606141920vanilla", + .help = "disable\tturn off latent entropy instrumentation\n", +}; + +static unsigned HOST_WIDE_INT seed; +/* + * get_random_seed() (this is a GCC function) generates the seed. + * This is a simple random generator without any cryptographic security because + * the entropy doesn't come from here. + */ +static unsigned HOST_WIDE_INT get_random_const(void) +{ + unsigned int i; + unsigned HOST_WIDE_INT ret = 0; + + for (i = 0; i < 8 * sizeof(ret); i++) { + ret = (ret << 1) | (seed & 1); + seed >>= 1; + if (ret & 1) + seed ^= 0xD800000000000000ULL; + } + + return ret; +} + +static tree tree_get_random_const(tree type) +{ + unsigned long long mask; + + mask = 1ULL << (TREE_INT_CST_LOW(TYPE_SIZE(type)) - 1); + mask = 2 * (mask - 1) + 1; + + if (TYPE_UNSIGNED(type)) + return build_int_cstu(type, mask & get_random_const()); + return build_int_cst(type, mask & get_random_const()); +} + +static tree handle_latent_entropy_attribute(tree *node, tree name, + tree args __unused, + int flags __unused, + bool *no_add_attrs) +{ + tree type; +#if BUILDING_GCC_VERSION <= 4007 + VEC(constructor_elt, gc) *vals; +#else + vec *vals; +#endif + + switch (TREE_CODE(*node)) { + default: + *no_add_attrs = true; + error("%qE attribute only applies to functions and variables", + name); + break; + + case VAR_DECL: + if (DECL_INITIAL(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be initialized", + *node, name); + break; + } + + if (!TREE_STATIC(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be local", + *node, name); + break; + } + + type = TREE_TYPE(*node); + switch (TREE_CODE(type)) { + default: + *no_add_attrs = true; + error("variable %qD with %qE attribute must be an integer or a fixed length integer array type or a fixed sized structure with integer fields", + *node, name); + break; + + case RECORD_TYPE: { + tree fld, lst = TYPE_FIELDS(type); + unsigned int nelt = 0; + + for (fld = lst; fld; nelt++, fld = TREE_CHAIN(fld)) { + tree fieldtype; + + fieldtype = TREE_TYPE(fld); + if (TREE_CODE(fieldtype) == INTEGER_TYPE) + continue; + + *no_add_attrs = true; + error("structure variable %qD with %qE attribute has a non-integer field %qE", + *node, name, fld); + break; + } + + if (fld) + break; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (fld = lst; fld; fld = TREE_CHAIN(fld)) { + tree random_const, fld_t = TREE_TYPE(fld); + + random_const = tree_get_random_const(fld_t); + CONSTRUCTOR_APPEND_ELT(vals, fld, random_const); + } + + /* Initialize the fields with random constants */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + + /* Initialize the variable with a random constant */ + case INTEGER_TYPE: + DECL_INITIAL(*node) = tree_get_random_const(type); + break; + + case ARRAY_TYPE: { + tree elt_type, array_size, elt_size; + unsigned int i, nelt; + HOST_WIDE_INT array_size_int, elt_size_int; + + elt_type = TREE_TYPE(type); + elt_size = TYPE_SIZE_UNIT(TREE_TYPE(type)); + array_size = TYPE_SIZE_UNIT(type); + + if (TREE_CODE(elt_type) != INTEGER_TYPE || !array_size + || TREE_CODE(array_size) != INTEGER_CST) { + *no_add_attrs = true; + error("array variable %qD with %qE attribute must be a fixed length integer array type", + *node, name); + 
break; + } + + array_size_int = TREE_INT_CST_LOW(array_size); + elt_size_int = TREE_INT_CST_LOW(elt_size); + nelt = array_size_int / elt_size_int; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (i = 0; i < nelt; i++) { + tree cst = size_int(i); + tree rand_cst = tree_get_random_const(elt_type); + + CONSTRUCTOR_APPEND_ELT(vals, cst, rand_cst); + } + + /* + * Initialize the elements of the array with random + * constants + */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + } + break; + + case FUNCTION_DECL: + break; + } + + return NULL_TREE; +} + +static struct attribute_spec latent_entropy_attr = { + .name = "latent_entropy", + .min_length = 0, + .max_length = 0, + .decl_required = true, + .type_required = false, + .function_type_required = false, + .handler = handle_latent_entropy_attribute, +#if BUILDING_GCC_VERSION >= 4007 + .affects_type_identity = false +#endif +}; + +static void register_attributes(void *event_data __unused, void *data __unused) +{ + register_attribute(&latent_entropy_attr); +} + +static bool latent_entropy_gate(void) +{ + tree list; + + /* don't bother with noreturn functions for now */ + if (TREE_THIS_VOLATILE(current_function_decl)) + return false; + + /* gcc-4.5 doesn't discover some trivial noreturn functions */ + if (EDGE_COUNT(EXIT_BLOCK_PTR_FOR_FN(cfun)->preds) == 0) + return false; + + list = DECL_ATTRIBUTES(current_function_decl); + return lookup_attribute("latent_entropy", list) != NULL_TREE; +} + +static tree create_var(tree type, const char *name) +{ + tree var; + + var = create_tmp_var(type, name); + add_referenced_var(var); + mark_sym_for_renaming(var); + return var; +} + +/* + * Set up the next operation and its constant operand to use in the latent + * entropy PRNG. When RHS is specified, the request is for perturbing the + * local latent entropy variable, otherwise it is for perturbing the global + * latent entropy variable where the two operands are already given by the + * local and global latent entropy variables themselves. + * + * The operation is one of add/xor/rol when instrumenting the local entropy + * variable and one of add/xor when perturbing the global entropy variable. + * Rotation is not used for the latter case because it would transmit less + * entropy to the global variable than the other two operations. 
+ */ +static enum tree_code get_op(tree *rhs) +{ + static enum tree_code op; + unsigned HOST_WIDE_INT random_const; + + random_const = get_random_const(); + + switch (op) { + case BIT_XOR_EXPR: + op = PLUS_EXPR; + break; + + case PLUS_EXPR: + if (rhs) { + op = LROTATE_EXPR; + /* + * This code limits the value of random_const to + * the size of a wide int for the rotation + */ + random_const &= HOST_BITS_PER_WIDE_INT - 1; + break; + } + + case LROTATE_EXPR: + default: + op = BIT_XOR_EXPR; + break; + } + if (rhs) + *rhs = build_int_cstu(unsigned_intDI_type_node, random_const); + return op; +} + +static gimple create_assign(enum tree_code code, tree lhs, tree op1, + tree op2) +{ + return gimple_build_assign_with_ops(code, lhs, op1, op2); +} + +static void perturb_local_entropy(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + gimple assign; + tree rhs; + enum tree_code op; + + op = get_op(&rhs); + assign = create_assign(op, local_entropy, local_entropy, rhs); + gsi = gsi_after_labels(bb); + gsi_insert_before(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static void __perturb_latent_entropy(gimple_stmt_iterator *gsi, + tree local_entropy) +{ + gimple assign; + tree temp; + enum tree_code op; + + /* 1. create temporary copy of latent_entropy */ + temp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 2. read... */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(temp, latent_entropy_decl); + gsi_insert_before(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. ...modify... */ + op = get_op(NULL); + assign = create_assign(op, temp, temp, local_entropy); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 4. ...write latent_entropy */ + assign = gimple_build_assign(latent_entropy_decl, temp); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool handle_tail_calls(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gcall *call; + gimple stmt = gsi_stmt(gsi); + + if (!is_gimple_call(stmt)) + continue; + + call = as_a_gcall(stmt); + if (!gimple_call_tail_p(call)) + continue; + + __perturb_latent_entropy(&gsi, local_entropy); + return true; + } + + return false; +} + +static void perturb_latent_entropy(tree local_entropy) +{ + edge_iterator ei; + edge e, last_bb_e; + basic_block last_bb; + + gcc_assert(single_pred_p(EXIT_BLOCK_PTR_FOR_FN(cfun))); + last_bb_e = single_pred_edge(EXIT_BLOCK_PTR_FOR_FN(cfun)); + + FOR_EACH_EDGE(e, ei, last_bb_e->src->preds) { + if (ENTRY_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + if (EXIT_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + + handle_tail_calls(e->src, local_entropy); + } + + last_bb = single_pred(EXIT_BLOCK_PTR_FOR_FN(cfun)); + if (!handle_tail_calls(last_bb, local_entropy)) { + gimple_stmt_iterator gsi = gsi_last_bb(last_bb); + + __perturb_latent_entropy(&gsi, local_entropy); + } +} + +static void init_local_entropy(basic_block bb, tree local_entropy) +{ + gimple assign, call; + tree frame_addr, rand_const, tmp, fndecl, udi_frame_addr; + enum tree_code op; + unsigned HOST_WIDE_INT rand_cst; + gimple_stmt_iterator gsi = gsi_after_labels(bb); + + /* 1. create local_entropy_frameaddr */ + frame_addr = create_var(ptr_type_node, "local_entropy_frameaddr"); + + /* 2. 
local_entropy_frameaddr = __builtin_frame_address() */ + fndecl = builtin_decl_implicit(BUILT_IN_FRAME_ADDRESS); + call = gimple_build_call(fndecl, 1, integer_zero_node); + gimple_call_set_lhs(call, frame_addr); + gsi_insert_before(&gsi, call, GSI_NEW_STMT); + update_stmt(call); + + udi_frame_addr = fold_convert(unsigned_intDI_type_node, frame_addr); + assign = gimple_build_assign(local_entropy, udi_frame_addr); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. create temporary copy of latent_entropy */ + tmp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 4. read the global entropy variable into local entropy */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(tmp, latent_entropy_decl); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 5. mix local_entropy_frameaddr into local entropy */ + assign = create_assign(BIT_XOR_EXPR, local_entropy, local_entropy, tmp); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + rand_cst = get_random_const(); + rand_const = build_int_cstu(unsigned_intDI_type_node, rand_cst); + op = get_op(NULL); + assign = create_assign(op, local_entropy, local_entropy, rand_const); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool create_latent_entropy_decl(void) +{ + varpool_node_ptr node; + + if (latent_entropy_decl != NULL_TREE) + return true; + + FOR_EACH_VARIABLE(node) { + tree name, var = NODE_DECL(node); + + if (DECL_NAME_LENGTH(var) < sizeof("latent_entropy") - 1) + continue; + + name = DECL_NAME(var); + if (strcmp(IDENTIFIER_POINTER(name), "latent_entropy")) + continue; + + latent_entropy_decl = var; + break; + } + + return latent_entropy_decl != NULL_TREE; +} + +static unsigned int latent_entropy_execute(void) +{ + basic_block bb; + tree local_entropy; + + if (!create_latent_entropy_decl()) + return 0; + + /* prepare for step 2 below */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + if (!single_pred_p(bb)) { + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + + /* 1. create the local entropy variable */ + local_entropy = create_var(unsigned_intDI_type_node, "local_entropy"); + + /* 2. initialize the local entropy variable */ + init_local_entropy(bb, local_entropy); + + bb = bb->next_bb; + + /* + * 3. instrument each BB with an operation on the + * local entropy variable + */ + while (bb != EXIT_BLOCK_PTR_FOR_FN(cfun)) { + perturb_local_entropy(bb, local_entropy); + bb = bb->next_bb; + }; + + /* 4. 
mix local entropy into the global entropy variable */ + perturb_latent_entropy(local_entropy); + return 0; +} + +static void latent_entropy_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree type, id; + int quals; + + seed = get_random_seed(false); + + if (in_lto_p) + return; + + /* extern volatile u64 latent_entropy */ + gcc_assert(TYPE_PRECISION(long_long_unsigned_type_node) == 64); + quals = TYPE_QUALS(long_long_unsigned_type_node) | TYPE_QUAL_VOLATILE; + type = build_qualified_type(long_long_unsigned_type_node, quals); + id = get_identifier("latent_entropy"); + latent_entropy_decl = build_decl(UNKNOWN_LOCATION, VAR_DECL, id, type); + + TREE_STATIC(latent_entropy_decl) = 1; + TREE_PUBLIC(latent_entropy_decl) = 1; + TREE_USED(latent_entropy_decl) = 1; + DECL_PRESERVE_P(latent_entropy_decl) = 1; + TREE_THIS_VOLATILE(latent_entropy_decl) = 1; + DECL_EXTERNAL(latent_entropy_decl) = 1; + DECL_ARTIFICIAL(latent_entropy_decl) = 1; + lang_hooks.decls.pushdecl(latent_entropy_decl); +} + +#define PASS_NAME latent_entropy +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa +#include "gcc-generate-gimple-pass.h" + +int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + bool enabled = true; + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i; + + struct register_pass_info latent_entropy_pass_info; + + latent_entropy_pass_info.pass = make_latent_entropy_pass(); + latent_entropy_pass_info.reference_pass_name = "optimized"; + latent_entropy_pass_info.ref_pass_instance_number = 1; + latent_entropy_pass_info.pos_op = PASS_POS_INSERT_BEFORE; + static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = { + { + .base = &latent_entropy_decl, + .nelt = 1, + .stride = sizeof(latent_entropy_decl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + for (i = 0; i < argc; ++i) { + if (!(strcmp(argv[i].key, "disable"))) { + enabled = false; + continue; + } + error(G_("unkown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); + } + + register_callback(plugin_name, PLUGIN_INFO, NULL, + &latent_entropy_plugin_info); + if (enabled) { + register_callback(plugin_name, PLUGIN_START_UNIT, + &latent_entropy_start_unit, NULL); + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, + NULL, (void *)>_ggc_r_gt_latent_entropy); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &latent_entropy_pass_info); + } + register_callback(plugin_name, PLUGIN_ATTRIBUTES, register_attributes, + NULL); + + return 0; +} -- cgit v1.2.3 From 0766f788eb727e2e330d55d30545db65bcf2623f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:42:34 +0200 Subject: latent_entropy: Mark functions with __latent_entropy The __latent_entropy gcc attribute can be used only on functions and variables. If it is on a function then the plugin will instrument it for gathering control-flow entropy. If the attribute is on a variable then the plugin will initialize it with random contents. The variable must be an integer, an integer array type or a structure with integer fields. 
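Both uses of the attribute appear in the hunks below; for example (excerpts taken from this patch, shown here only to illustrate the two cases):

	/* Function: the plugin instruments it to gather control-flow entropy. */
	static __latent_entropy void blk_done_softirq(struct softirq_action *h);

	/* Variables: the plugin initializes them with build-time random contents. */
	static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
	static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
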
These specific functions have been selected because they are init functions (to help gather boot-time entropy), are called at unpredictable times, or they have variable loops, each of which provide some level of latent entropy. Signed-off-by: Emese Revfy [kees: expanded commit message] Signed-off-by: Kees Cook --- block/blk-softirq.c | 2 +- drivers/char/random.c | 4 ++-- fs/namespace.c | 1 + include/linux/compiler-gcc.h | 7 +++++++ include/linux/compiler.h | 4 ++++ include/linux/fdtable.h | 2 +- include/linux/genhd.h | 2 +- include/linux/init.h | 5 +++-- include/linux/random.h | 4 ++-- kernel/fork.c | 6 ++++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/timer.c | 2 +- lib/irq_poll.c | 2 +- lib/random32.c | 2 +- mm/page_alloc.c | 2 +- net/core/dev.c | 4 ++-- 19 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..489eab825a8a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); * Softirq action handler - move entries to local list and loop over them * while passing them to the queue registered handler. */ -static void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { struct list_head *cpu_list, local_list; diff --git a/drivers/char/random.c b/drivers/char/random.c index 3efb3bf0ab83..7274ae89ddb3 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -479,8 +479,8 @@ static ssize_t _extract_entropy(struct entropy_store *r, void *buf, static void crng_reseed(struct crng_state *crng, struct entropy_store *r); static void push_to_pool(struct work_struct *work); -static __u32 input_pool_data[INPUT_POOL_WORDS]; -static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; +static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy; +static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy; static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..4a9568b81138 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2759,6 +2759,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 573c5a18908f..432f5c97e18f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -188,6 +188,13 @@ #endif /* GCC_VERSION >= 40300 */ #if GCC_VERSION >= 40500 + +#ifndef __CHECKER__ +#ifdef LATENT_ENTROPY_PLUGIN +#define __latent_entropy __attribute__((latent_entropy)) +#endif +#endif + /* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 668569844d37..ceaddaf76ff1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -406,6 +406,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s # define __attribute_const__ /* unimplemented */ #endif +#ifndef __latent_entropy +# define __latent_entropy +#endif + /* * Tell gcc if a function is cold. The compiler will assume any path * directly leading to the call is unlikely. 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 5295535b60c6..9852c7e33466 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -105,7 +105,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *); +struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1dbf52f9c24b..e0341af6950e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -437,7 +437,7 @@ extern void disk_flush_events(struct gendisk *disk, unsigned int mask); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk); +extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) diff --git a/include/linux/init.h b/include/linux/init.h index 6935d02474aa..1e5c131d5c9a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace +#define __init __section(.init.text) __cold notrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __constsection(.init.rodata) #define __exitdata __section(.exit.data) @@ -86,7 +86,8 @@ #define __exit __section(.exit.text) __exitused __cold notrace /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold notrace +#define __meminit __section(.meminit.text) __cold notrace \ + __latent_entropy #define __meminitdata __section(.meminit.data) #define __meminitconst __constsection(.meminit.rodata) #define __memexit __section(.memexit.text) __exitused __cold notrace diff --git a/include/linux/random.h b/include/linux/random.h index a59c74cdb1eb..d80a4388a4fd 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -30,8 +30,8 @@ static inline void add_latent_entropy(void) {} #endif extern void add_input_randomness(unsigned int type, unsigned int code, - unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); + unsigned int value) __latent_entropy; +extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); diff --git a/kernel/fork.c b/kernel/fork.c index 001b18473a07..05393881ef39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,7 +404,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1296,7 +1297,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. 
*/ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..e5164deb51e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..004996df2f10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8283,7 +8283,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..34033fd09c8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -482,7 +482,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +518,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. 
*/ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..63be7495dbb2 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/lib/random32.c b/lib/random32.c index 69ed593aab07..a30923573676 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 248851d1fc86..901121af4e54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy; +volatile u64 latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif diff --git a/net/core/dev.c b/net/core/dev.c index ea6312057a71..ee076c2791f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3855,7 +3855,7 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5187,7 +5187,7 @@ out_unlock: return work; } -static void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; -- cgit v1.2.3 From b60e4ea4a400bde8a4811f94b84a9bb65f81b677 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 29 Sep 2016 16:39:41 +0300 Subject: ACPI / property: Allow holes in reference properties DT allows holes or empty phandles for references. This is used for example in SPI subsystem where some chip selects are native and others are regular GPIOs. In ACPI _DSD we currently do not support this but instead the preceding reference consumes all following integer arguments. For example we would like to support something like the below ASL fragment for SPI: Package () { "cs-gpios", Package () { ^GPIO, 19, 0, 0, // GPIO CS0 0, // Native CS ^GPIO, 20, 0, 0, // GPIO CS1 } } The zero in the middle means "no entry" or NULL reference. To support this we change acpi_data_get_property_reference() to take firmware node and num_args as argument and rename it to __acpi_node_get_property_reference(). The function returns -ENOENT if the given index resolves to "no entry" reference and -ENODATA when there are no more entries in the property. We then add static inline wrapper acpi_node_get_property_reference() that passes MAX_ACPI_REFERENCE_ARGS as num_args to support the existing behaviour which some drivers have been relying on. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 117 ++++++++++++++++++++++++++++-------------------- include/linux/acpi.h | 22 +++++++-- 2 files changed, 87 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index f2fd3fee588a..03f5ec11ab31 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -468,10 +468,11 @@ static int acpi_data_get_property_array(struct acpi_device_data *data, } /** - * acpi_data_get_property_reference - returns handle to the referenced object - * @data: ACPI device data object containing the property + * __acpi_node_get_property_reference - returns handle to the referenced object + * @fwnode: Firmware node to get the property from * @propname: Name of the property * @index: Index of the reference to return + * @num_args: Maximum number of arguments after each reference * @args: Location to store the returned reference with optional arguments * * Find property with @name, verifify that it is a package containing at least @@ -482,17 +483,40 @@ static int acpi_data_get_property_array(struct acpi_device_data *data, * If there's more than one reference in the property value package, @index is * used to select the one to return. * + * It is possible to leave holes in the property value set like in the + * example below: + * + * Package () { + * "cs-gpios", + * Package () { + * ^GPIO, 19, 0, 0, + * ^GPIO, 20, 0, 0, + * 0, + * ^GPIO, 21, 0, 0, + * } + * } + * + * Calling this function with index %2 return %-ENOENT and with index %3 + * returns the last entry. If the property does not contain any more values + * %-ENODATA is returned. The NULL entry must be single integer and + * preferably contain value %0. + * * Return: %0 on success, negative error code on failure. 
*/ -static int acpi_data_get_property_reference(struct acpi_device_data *data, - const char *propname, size_t index, - struct acpi_reference_args *args) +int __acpi_node_get_property_reference(struct fwnode_handle *fwnode, + const char *propname, size_t index, size_t num_args, + struct acpi_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; + struct acpi_device_data *data; struct acpi_device *device; int ret, idx = 0; + data = acpi_device_data_of_node(fwnode); + if (!data) + return -EINVAL; + ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) return ret; @@ -532,59 +556,54 @@ static int acpi_data_get_property_reference(struct acpi_device_data *data, while (element < end) { u32 nargs, i; - if (element->type != ACPI_TYPE_LOCAL_REFERENCE) - return -EPROTO; - - ret = acpi_bus_get_device(element->reference.handle, &device); - if (ret) - return -ENODEV; - - element++; - nargs = 0; - - /* assume following integer elements are all args */ - for (i = 0; element + i < end; i++) { - int type = element[i].type; + if (element->type == ACPI_TYPE_LOCAL_REFERENCE) { + ret = acpi_bus_get_device(element->reference.handle, + &device); + if (ret) + return -ENODEV; + + nargs = 0; + element++; + + /* assume following integer elements are all args */ + for (i = 0; element + i < end && i < num_args; i++) { + int type = element[i].type; + + if (type == ACPI_TYPE_INTEGER) + nargs++; + else if (type == ACPI_TYPE_LOCAL_REFERENCE) + break; + else + return -EPROTO; + } - if (type == ACPI_TYPE_INTEGER) - nargs++; - else if (type == ACPI_TYPE_LOCAL_REFERENCE) - break; - else + if (nargs > MAX_ACPI_REFERENCE_ARGS) return -EPROTO; - } - if (idx++ == index) { - args->adev = device; - args->nargs = nargs; - for (i = 0; i < nargs; i++) - args->args[i] = element[i].integer.value; + if (idx == index) { + args->adev = device; + args->nargs = nargs; + for (i = 0; i < nargs; i++) + args->args[i] = element[i].integer.value; - return 0; + return 0; + } + + element += nargs; + } else if (element->type == ACPI_TYPE_INTEGER) { + if (idx == index) + return -ENOENT; + element++; + } else { + return -EPROTO; } - element += nargs; + idx++; } - return -EPROTO; -} - -/** - * acpi_node_get_property_reference - get a handle to the referenced object. - * @fwnode: Firmware node to get the property from. - * @propname: Name of the property. - * @index: Index of the reference to return. - * @args: Location to store the returned reference with optional arguments. - */ -int acpi_node_get_property_reference(struct fwnode_handle *fwnode, - const char *name, size_t index, - struct acpi_reference_args *args) -{ - struct acpi_device_data *data = acpi_device_data_of_node(fwnode); - - return data ? 
acpi_data_get_property_reference(data, name, index, args) : -EINVAL; + return -ENODATA; } -EXPORT_SYMBOL_GPL(acpi_node_get_property_reference); +EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); static int acpi_data_prop_read_single(struct acpi_device_data *data, const char *propname, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d8452c2384b..632ec16a855e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -927,9 +927,17 @@ struct acpi_reference_args { #ifdef CONFIG_ACPI int acpi_dev_get_property(struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); -int acpi_node_get_property_reference(struct fwnode_handle *fwnode, - const char *name, size_t index, - struct acpi_reference_args *args); +int __acpi_node_get_property_reference(struct fwnode_handle *fwnode, + const char *name, size_t index, size_t num_args, + struct acpi_reference_args *args); + +static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, + const char *name, size_t index, + struct acpi_reference_args *args) +{ + return __acpi_node_get_property_reference(fwnode, name, index, + MAX_ACPI_REFERENCE_ARGS, args); +} int acpi_node_prop_get(struct fwnode_handle *fwnode, const char *propname, void **valptr); @@ -1005,6 +1013,14 @@ static inline int acpi_dev_get_property(struct acpi_device *adev, return -ENXIO; } +static inline int +__acpi_node_get_property_reference(struct fwnode_handle *fwnode, + const char *name, size_t index, size_t num_args, + struct acpi_reference_args *args) +{ + return -ENXIO; +} + static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, const char *name, size_t index, struct acpi_reference_args *args) -- cgit v1.2.3 From 819bf593767c0966f320c51c6ed3f3835062abc8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Oct 2016 14:56:53 +0200 Subject: docs-rst: sphinxify 802.11 documentation This is just a very basic conversion, I've split up the original multi-book template, and also split up the multi-part mac80211 part in the original book; neither of those were handled by the automatic pandoc conversion. Fix errors that showed up, resulting in a much nicer rendering, at least for the interface combinations documentation. Signed-off-by: Johannes Berg Signed-off-by: Jonathan Corbet --- Documentation/80211/cfg80211.rst | 345 ++++++++++++++++++ Documentation/80211/conf.py | 5 + Documentation/80211/index.rst | 17 + Documentation/80211/introduction.rst | 17 + Documentation/80211/mac80211-advanced.rst | 295 +++++++++++++++ Documentation/80211/mac80211.rst | 216 +++++++++++ Documentation/DocBook/80211.tmpl | 584 ------------------------------ Documentation/DocBook/Makefile | 2 +- Documentation/index.rst | 1 + include/net/cfg80211.h | 77 ++-- 10 files changed, 939 insertions(+), 620 deletions(-) create mode 100644 Documentation/80211/cfg80211.rst create mode 100644 Documentation/80211/conf.py create mode 100644 Documentation/80211/index.rst create mode 100644 Documentation/80211/introduction.rst create mode 100644 Documentation/80211/mac80211-advanced.rst create mode 100644 Documentation/80211/mac80211.rst delete mode 100644 Documentation/DocBook/80211.tmpl (limited to 'include') diff --git a/Documentation/80211/cfg80211.rst b/Documentation/80211/cfg80211.rst new file mode 100644 index 000000000000..b1e149ea6fee --- /dev/null +++ b/Documentation/80211/cfg80211.rst @@ -0,0 +1,345 @@ +================== +cfg80211 subsystem +================== + +Device registration +=================== + +.. 
kernel-doc:: include/net/cfg80211.h + :doc: Device registration + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_channel_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_channel + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_rate_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_rate + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_sta_ht_cap + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_supported_band + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_signal_type + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_params_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy + +.. kernel-doc:: include/net/cfg80211.h + :functions: wireless_dev + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_new + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_register + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_unregister + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_free + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_name + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_dev + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_priv + +.. kernel-doc:: include/net/cfg80211.h + :functions: priv_to_wiphy + +.. kernel-doc:: include/net/cfg80211.h + :functions: set_wiphy_dev + +.. kernel-doc:: include/net/cfg80211.h + :functions: wdev_priv + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_iface_limit + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_iface_combination + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_check_combinations + +Actions and configuration +========================= + +.. kernel-doc:: include/net/cfg80211.h + :doc: Actions and configuration + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ops + +.. kernel-doc:: include/net/cfg80211.h + :functions: vif_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: key_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: survey_info_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: survey_info + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_beacon_data + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ap_settings + +.. kernel-doc:: include/net/cfg80211.h + :functions: station_parameters + +.. kernel-doc:: include/net/cfg80211.h + :functions: rate_info_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: rate_info + +.. kernel-doc:: include/net/cfg80211.h + :functions: station_info + +.. kernel-doc:: include/net/cfg80211.h + :functions: monitor_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: mpath_info_flags + +.. kernel-doc:: include/net/cfg80211.h + :functions: mpath_info + +.. kernel-doc:: include/net/cfg80211.h + :functions: bss_parameters + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_txq_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_crypto_settings + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_auth_request + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_assoc_request + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_deauth_request + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_disassoc_request + +.. 
kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ibss_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_pmksa + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_rx_mlme_mgmt + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_auth_timeout + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_rx_assoc_resp + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_assoc_timeout + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_tx_mlme_mgmt + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ibss_joined + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_result + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_bss + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_timeout + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_roamed + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_disconnected + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ready_on_channel + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_remain_on_channel_expired + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_new_sta + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_rx_mgmt + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_mgmt_tx_status + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_cqm_rssi_notify + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_cqm_pktloss_notify + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_michael_mic_failure + +Scanning and BSS list handling +============================== + +.. kernel-doc:: include/net/cfg80211.h + :doc: Scanning and BSS list handling + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_ssid + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_scan_request + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_scan_done + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_bss + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_inform_bss + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_inform_bss_frame_data + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_inform_bss_data + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_unlink_bss + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_find_ie + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_bss_get_ie + +Utility functions +================= + +.. kernel-doc:: include/net/cfg80211.h + :doc: Utility functions + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_channel_to_frequency + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_frequency_to_channel + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_get_channel + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_get_response_rate + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_hdrlen + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_get_hdrlen_from_skb + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_radiotap_iterator + +Data path helpers +================= + +.. kernel-doc:: include/net/cfg80211.h + :doc: Data path helpers + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_data_to_8023 + +.. 
kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_data_from_8023 + +.. kernel-doc:: include/net/cfg80211.h + :functions: ieee80211_amsdu_to_8023s + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_classify8021d + +Regulatory enforcement infrastructure +===================================== + +.. kernel-doc:: include/net/cfg80211.h + :doc: Regulatory enforcement infrastructure + +.. kernel-doc:: include/net/cfg80211.h + :functions: regulatory_hint + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_apply_custom_regulatory + +.. kernel-doc:: include/net/cfg80211.h + :functions: freq_reg_info + +RFkill integration +================== + +.. kernel-doc:: include/net/cfg80211.h + :doc: RFkill integration + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_rfkill_set_hw_state + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_rfkill_start_polling + +.. kernel-doc:: include/net/cfg80211.h + :functions: wiphy_rfkill_stop_polling + +Test mode +========= + +.. kernel-doc:: include/net/cfg80211.h + :doc: Test mode + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_testmode_alloc_reply_skb + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_testmode_reply + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_testmode_alloc_event_skb + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_testmode_event diff --git a/Documentation/80211/conf.py b/Documentation/80211/conf.py new file mode 100644 index 000000000000..20c7c275ef4a --- /dev/null +++ b/Documentation/80211/conf.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8; mode: python -*- + +project = "Linux 802.11 Driver Developer's Guide" + +tags.add("subproject") diff --git a/Documentation/80211/index.rst b/Documentation/80211/index.rst new file mode 100644 index 000000000000..90bba476f442 --- /dev/null +++ b/Documentation/80211/index.rst @@ -0,0 +1,17 @@ +===================================== +Linux 802.11 Driver Developer's Guide +===================================== + +.. toctree:: + + introduction + cfg80211 + mac80211 + mac80211-advanced + +.. only:: subproject + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/80211/introduction.rst b/Documentation/80211/introduction.rst new file mode 100644 index 000000000000..4938fa87691c --- /dev/null +++ b/Documentation/80211/introduction.rst @@ -0,0 +1,17 @@ +============ +Introduction +============ + +Explaining wireless 802.11 networking in the Linux kernel + +Copyright 2007-2009 Johannes Berg + +These books attempt to give a description of the various subsystems +that play a role in 802.11 wireless networking in Linux. Since these +books are for kernel developers they attempts to document the +structures and functions used in the kernel as well as giving a +higher-level overview. + +The reader is expected to be familiar with the 802.11 standard as +published by the IEEE in 802.11-2007 (or possibly later versions). +References to this standard will be given as "802.11-2007 8.1.5". diff --git a/Documentation/80211/mac80211-advanced.rst b/Documentation/80211/mac80211-advanced.rst new file mode 100644 index 000000000000..70a89b2163c2 --- /dev/null +++ b/Documentation/80211/mac80211-advanced.rst @@ -0,0 +1,295 @@ +============================= +mac80211 subsystem (advanced) +============================= + +Information contained within this part of the book is of interest only +for advanced interaction of mac80211 with drivers to exploit more +hardware capabilities and improve performance. 
+ +LED support +=========== + +Mac80211 supports various ways of blinking LEDs. Wherever possible, +device LEDs should be exposed as LED class devices and hooked up to the +appropriate trigger, which will then be triggered appropriately by +mac80211. + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_tx_led_name + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_rx_led_name + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_assoc_led_name + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_radio_led_name + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tpt_blink + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tpt_led_trigger_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_create_tpt_led_trigger + +Hardware crypto acceleration +============================ + +.. kernel-doc:: include/net/mac80211.h + :doc: Hardware crypto acceleration + +.. kernel-doc:: include/net/mac80211.h + :functions: set_key_cmd + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_key_conf + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_key_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_tkip_p1k + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_tkip_p1k_iv + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_tkip_p2k + +Powersave support +================= + +.. kernel-doc:: include/net/mac80211.h + :doc: Powersave support + +Beacon filter support +===================== + +.. kernel-doc:: include/net/mac80211.h + :doc: Beacon filter support + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_beacon_loss + +Multiple queues and QoS support +=============================== + +TBD + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_queue_params + +Access point mode support +========================= + +TBD + +Some parts of the if_conf should be discussed here instead + +Insert notes about VLAN interfaces with hw crypto here or in the hw +crypto chapter. + +support for powersaving clients +------------------------------- + +.. kernel-doc:: include/net/mac80211.h + :doc: AP support for powersaving clients + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_get_buffered_bc + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_beacon_get + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta_eosp + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_frame_release_type + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta_ps_transition + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta_ps_transition_ni + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta_set_buffered + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta_block_awake + +Supporting multiple virtual interfaces +====================================== + +TBD + +Note: WDS with identical MAC address should almost always be OK + +Insert notes about having multiple virtual interfaces with different MAC +addresses here, note which configurations are supported by mac80211, add +notes about supporting hw crypto with it. + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_iterate_active_interfaces + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_iterate_active_interfaces_atomic + +Station handling +================ + +TODO + +.. 
kernel-doc:: include/net/mac80211.h + :functions: ieee80211_sta + +.. kernel-doc:: include/net/mac80211.h + :functions: sta_notify_cmd + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_find_sta + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_find_sta_by_ifaddr + +Hardware scan offload +===================== + +TBD + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_scan_completed + +Aggregation +=========== + +TX A-MPDU aggregation +--------------------- + +.. kernel-doc:: net/mac80211/agg-tx.c + :doc: TX A-MPDU aggregation + +.. WARNING: DOCPROC directive not supported: !Cnet/mac80211/agg-tx.c + +RX A-MPDU aggregation +--------------------- + +.. kernel-doc:: net/mac80211/agg-rx.c + :doc: RX A-MPDU aggregation + +.. WARNING: DOCPROC directive not supported: !Cnet/mac80211/agg-rx.c + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_ampdu_mlme_action + +Spatial Multiplexing Powersave (SMPS) +===================================== + +.. kernel-doc:: include/net/mac80211.h + :doc: Spatial multiplexing power save + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_request_smps + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_smps_mode + +TBD + +This part of the book describes the rate control algorithm interface and +how it relates to mac80211 and drivers. + +Rate Control API +================ + +TBD + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_start_tx_ba_session + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_start_tx_ba_cb_irqsafe + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_stop_tx_ba_session + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_stop_tx_ba_cb_irqsafe + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rate_control_changed + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_rate_control + +.. kernel-doc:: include/net/mac80211.h + :functions: rate_control_send_low + +TBD + +This part of the book describes mac80211 internals. + +Key handling +============ + +Key handling basics +------------------- + +.. kernel-doc:: net/mac80211/key.c + :doc: Key handling basics + +MORE TBD +-------- + +TBD + +Receive processing +================== + +TBD + +Transmit processing +=================== + +TBD + +Station info handling +===================== + +Programming information +----------------------- + +.. kernel-doc:: net/mac80211/sta_info.h + :functions: sta_info + +.. kernel-doc:: net/mac80211/sta_info.h + :functions: ieee80211_sta_info_flags + +STA information lifetime rules +------------------------------ + +.. kernel-doc:: net/mac80211/sta_info.c + :doc: STA information lifetime rules + +Aggregation +=========== + +.. kernel-doc:: net/mac80211/sta_info.h + :functions: sta_ampdu_mlme + +.. kernel-doc:: net/mac80211/sta_info.h + :functions: tid_ampdu_tx + +.. kernel-doc:: net/mac80211/sta_info.h + :functions: tid_ampdu_rx + +Synchronisation +=============== + +TBD + +Locking, lots of RCU diff --git a/Documentation/80211/mac80211.rst b/Documentation/80211/mac80211.rst new file mode 100644 index 000000000000..85a8335e80b6 --- /dev/null +++ b/Documentation/80211/mac80211.rst @@ -0,0 +1,216 @@ +=========================== +mac80211 subsystem (basics) +=========================== + +You should read and understand the information contained within this +part of the book while implementing a mac80211 driver. In some chapters, +advanced usage is noted, those may be skipped if this isn't needed. 
+ +This part of the book only covers station and monitor mode +functionality, additional information required to implement the other +modes is covered in the second part of the book. + +Basic hardware handling +======================= + +TBD + +This chapter shall contain information on getting a hw struct allocated +and registered with mac80211. + +Since it is required to allocate rates/modes before registering a hw +struct, this chapter shall also contain information on setting up the +rate/mode structs. + +Additionally, some discussion about the callbacks and the general +programming model should be in here, including the definition of +ieee80211_ops which will be referred to a lot. + +Finally, a discussion of hardware capabilities should be done with +references to other parts of the book. + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_hw + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_hw_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: SET_IEEE80211_DEV + +.. kernel-doc:: include/net/mac80211.h + :functions: SET_IEEE80211_PERM_ADDR + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_ops + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_alloc_hw + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_register_hw + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_unregister_hw + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_free_hw + +PHY configuration +================= + +TBD + +This chapter should describe PHY handling including start/stop callbacks +and the various structures used. + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_conf + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_conf_flags + +Virtual interfaces +================== + +TBD + +This chapter should describe virtual interface basics that are relevant +to the driver (VLANs, MGMT etc are not.) It should explain the use of +the add_iface/remove_iface callbacks as well as the interface +configuration callbacks. + +Things related to AP mode should be discussed there. + +Things related to supporting multiple interfaces should be in the +appropriate chapter, a BIG FAT note should be here about this though and +the recommendation to allow only a single interface in STA mode at +first! + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_vif + +Receive and transmit processing +=============================== + +what should be here +------------------- + +TBD + +This should describe the receive and transmit paths in mac80211/the +drivers as well as transmit status handling. + +Frame format +------------ + +.. kernel-doc:: include/net/mac80211.h + :doc: Frame format + +Packet alignment +---------------- + +.. kernel-doc:: net/mac80211/rx.c + :doc: Packet alignment + +Calling into mac80211 from interrupts +------------------------------------- + +.. kernel-doc:: include/net/mac80211.h + :doc: Calling mac80211 from interrupts + +functions/definitions +--------------------- + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rx_status + +.. kernel-doc:: include/net/mac80211.h + :functions: mac80211_rx_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: mac80211_tx_info_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: mac80211_tx_control_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: mac80211_rate_control_flags + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_rate + +.. 
kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_info + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_info_clear_status + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rx + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rx_ni + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rx_irqsafe + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_status + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_status_ni + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_tx_status_irqsafe + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rts_get + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_rts_duration + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_ctstoself_get + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_ctstoself_duration + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_generic_frame_duration + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_wake_queue + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_stop_queue + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_wake_queues + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_stop_queues + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_queue_stopped + +Frame filtering +=============== + +.. kernel-doc:: include/net/mac80211.h + :doc: Frame filtering + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_filter_flags + +The mac80211 workqueue +====================== + +.. kernel-doc:: include/net/mac80211.h + :doc: mac80211 workqueue + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_queue_work + +.. kernel-doc:: include/net/mac80211.h + :functions: ieee80211_queue_delayed_work diff --git a/Documentation/DocBook/80211.tmpl b/Documentation/DocBook/80211.tmpl deleted file mode 100644 index 800fe7a9024c..000000000000 --- a/Documentation/DocBook/80211.tmpl +++ /dev/null @@ -1,584 +0,0 @@ - - - - - The 802.11 subsystems – for kernel developers - - Explaining wireless 802.11 networking in the Linux kernel - - - - 2007-2009 - Johannes Berg - - - - - Johannes - Berg - -
johannes@sipsolutions.net
-
-
-
- - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - This documentation is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - You should have received a copy of the GNU General Public - License along with this documentation; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - For more details see the file COPYING in the source - distribution of Linux. - - - - - - These books attempt to give a description of the - various subsystems that play a role in 802.11 wireless - networking in Linux. Since these books are for kernel - developers they attempts to document the structures - and functions used in the kernel as well as giving a - higher-level overview. - - - The reader is expected to be familiar with the 802.11 - standard as published by the IEEE in 802.11-2007 (or - possibly later versions). References to this standard - will be given as "802.11-2007 8.1.5". - - -
- - - The cfg80211 subsystem - - -!Pinclude/net/cfg80211.h Introduction - - - - Device registration -!Pinclude/net/cfg80211.h Device registration -!Finclude/net/cfg80211.h ieee80211_channel_flags -!Finclude/net/cfg80211.h ieee80211_channel -!Finclude/net/cfg80211.h ieee80211_rate_flags -!Finclude/net/cfg80211.h ieee80211_rate -!Finclude/net/cfg80211.h ieee80211_sta_ht_cap -!Finclude/net/cfg80211.h ieee80211_supported_band -!Finclude/net/cfg80211.h cfg80211_signal_type -!Finclude/net/cfg80211.h wiphy_params_flags -!Finclude/net/cfg80211.h wiphy_flags -!Finclude/net/cfg80211.h wiphy -!Finclude/net/cfg80211.h wireless_dev -!Finclude/net/cfg80211.h wiphy_new -!Finclude/net/cfg80211.h wiphy_register -!Finclude/net/cfg80211.h wiphy_unregister -!Finclude/net/cfg80211.h wiphy_free - -!Finclude/net/cfg80211.h wiphy_name -!Finclude/net/cfg80211.h wiphy_dev -!Finclude/net/cfg80211.h wiphy_priv -!Finclude/net/cfg80211.h priv_to_wiphy -!Finclude/net/cfg80211.h set_wiphy_dev -!Finclude/net/cfg80211.h wdev_priv -!Finclude/net/cfg80211.h ieee80211_iface_limit -!Finclude/net/cfg80211.h ieee80211_iface_combination -!Finclude/net/cfg80211.h cfg80211_check_combinations - - - Actions and configuration -!Pinclude/net/cfg80211.h Actions and configuration -!Finclude/net/cfg80211.h cfg80211_ops -!Finclude/net/cfg80211.h vif_params -!Finclude/net/cfg80211.h key_params -!Finclude/net/cfg80211.h survey_info_flags -!Finclude/net/cfg80211.h survey_info -!Finclude/net/cfg80211.h cfg80211_beacon_data -!Finclude/net/cfg80211.h cfg80211_ap_settings -!Finclude/net/cfg80211.h station_parameters -!Finclude/net/cfg80211.h rate_info_flags -!Finclude/net/cfg80211.h rate_info -!Finclude/net/cfg80211.h station_info -!Finclude/net/cfg80211.h monitor_flags -!Finclude/net/cfg80211.h mpath_info_flags -!Finclude/net/cfg80211.h mpath_info -!Finclude/net/cfg80211.h bss_parameters -!Finclude/net/cfg80211.h ieee80211_txq_params -!Finclude/net/cfg80211.h cfg80211_crypto_settings -!Finclude/net/cfg80211.h cfg80211_auth_request -!Finclude/net/cfg80211.h cfg80211_assoc_request -!Finclude/net/cfg80211.h cfg80211_deauth_request -!Finclude/net/cfg80211.h cfg80211_disassoc_request -!Finclude/net/cfg80211.h cfg80211_ibss_params -!Finclude/net/cfg80211.h cfg80211_connect_params -!Finclude/net/cfg80211.h cfg80211_pmksa -!Finclude/net/cfg80211.h cfg80211_rx_mlme_mgmt -!Finclude/net/cfg80211.h cfg80211_auth_timeout -!Finclude/net/cfg80211.h cfg80211_rx_assoc_resp -!Finclude/net/cfg80211.h cfg80211_assoc_timeout -!Finclude/net/cfg80211.h cfg80211_tx_mlme_mgmt -!Finclude/net/cfg80211.h cfg80211_ibss_joined -!Finclude/net/cfg80211.h cfg80211_connect_result -!Finclude/net/cfg80211.h cfg80211_connect_bss -!Finclude/net/cfg80211.h cfg80211_connect_timeout -!Finclude/net/cfg80211.h cfg80211_roamed -!Finclude/net/cfg80211.h cfg80211_disconnected -!Finclude/net/cfg80211.h cfg80211_ready_on_channel -!Finclude/net/cfg80211.h cfg80211_remain_on_channel_expired -!Finclude/net/cfg80211.h cfg80211_new_sta -!Finclude/net/cfg80211.h cfg80211_rx_mgmt -!Finclude/net/cfg80211.h cfg80211_mgmt_tx_status -!Finclude/net/cfg80211.h cfg80211_cqm_rssi_notify -!Finclude/net/cfg80211.h cfg80211_cqm_pktloss_notify -!Finclude/net/cfg80211.h cfg80211_michael_mic_failure - - - Scanning and BSS list handling -!Pinclude/net/cfg80211.h Scanning and BSS list handling -!Finclude/net/cfg80211.h cfg80211_ssid -!Finclude/net/cfg80211.h cfg80211_scan_request -!Finclude/net/cfg80211.h cfg80211_scan_done -!Finclude/net/cfg80211.h cfg80211_bss -!Finclude/net/cfg80211.h cfg80211_inform_bss 
-!Finclude/net/cfg80211.h cfg80211_inform_bss_frame_data -!Finclude/net/cfg80211.h cfg80211_inform_bss_data -!Finclude/net/cfg80211.h cfg80211_unlink_bss -!Finclude/net/cfg80211.h cfg80211_find_ie -!Finclude/net/cfg80211.h ieee80211_bss_get_ie - - - Utility functions -!Pinclude/net/cfg80211.h Utility functions -!Finclude/net/cfg80211.h ieee80211_channel_to_frequency -!Finclude/net/cfg80211.h ieee80211_frequency_to_channel -!Finclude/net/cfg80211.h ieee80211_get_channel -!Finclude/net/cfg80211.h ieee80211_get_response_rate -!Finclude/net/cfg80211.h ieee80211_hdrlen -!Finclude/net/cfg80211.h ieee80211_get_hdrlen_from_skb -!Finclude/net/cfg80211.h ieee80211_radiotap_iterator - - - Data path helpers -!Pinclude/net/cfg80211.h Data path helpers -!Finclude/net/cfg80211.h ieee80211_data_to_8023 -!Finclude/net/cfg80211.h ieee80211_data_from_8023 -!Finclude/net/cfg80211.h ieee80211_amsdu_to_8023s -!Finclude/net/cfg80211.h cfg80211_classify8021d - - - Regulatory enforcement infrastructure -!Pinclude/net/cfg80211.h Regulatory enforcement infrastructure -!Finclude/net/cfg80211.h regulatory_hint -!Finclude/net/cfg80211.h wiphy_apply_custom_regulatory -!Finclude/net/cfg80211.h freq_reg_info - - - RFkill integration -!Pinclude/net/cfg80211.h RFkill integration -!Finclude/net/cfg80211.h wiphy_rfkill_set_hw_state -!Finclude/net/cfg80211.h wiphy_rfkill_start_polling -!Finclude/net/cfg80211.h wiphy_rfkill_stop_polling - - - Test mode -!Pinclude/net/cfg80211.h Test mode -!Finclude/net/cfg80211.h cfg80211_testmode_alloc_reply_skb -!Finclude/net/cfg80211.h cfg80211_testmode_reply -!Finclude/net/cfg80211.h cfg80211_testmode_alloc_event_skb -!Finclude/net/cfg80211.h cfg80211_testmode_event - - - - - The mac80211 subsystem - -!Pinclude/net/mac80211.h Introduction -!Pinclude/net/mac80211.h Warning - - - - - - - - - The basic mac80211 driver interface - - - You should read and understand the information contained - within this part of the book while implementing a driver. - In some chapters, advanced usage is noted, that may be - skipped at first. - - - This part of the book only covers station and monitor mode - functionality, additional information required to implement - the other modes is covered in the second part of the book. - - - - - Basic hardware handling - TBD - - This chapter shall contain information on getting a hw - struct allocated and registered with mac80211. - - - Since it is required to allocate rates/modes before registering - a hw struct, this chapter shall also contain information on setting - up the rate/mode structs. - - - Additionally, some discussion about the callbacks and - the general programming model should be in here, including - the definition of ieee80211_ops which will be referred to - a lot. - - - Finally, a discussion of hardware capabilities should be done - with references to other parts of the book. - - -!Finclude/net/mac80211.h ieee80211_hw -!Finclude/net/mac80211.h ieee80211_hw_flags -!Finclude/net/mac80211.h SET_IEEE80211_DEV -!Finclude/net/mac80211.h SET_IEEE80211_PERM_ADDR -!Finclude/net/mac80211.h ieee80211_ops -!Finclude/net/mac80211.h ieee80211_alloc_hw -!Finclude/net/mac80211.h ieee80211_register_hw -!Finclude/net/mac80211.h ieee80211_unregister_hw -!Finclude/net/mac80211.h ieee80211_free_hw - - - - PHY configuration - TBD - - This chapter should describe PHY handling including - start/stop callbacks and the various structures used. 
- -!Finclude/net/mac80211.h ieee80211_conf -!Finclude/net/mac80211.h ieee80211_conf_flags - - - - Virtual interfaces - TBD - - This chapter should describe virtual interface basics - that are relevant to the driver (VLANs, MGMT etc are not.) - It should explain the use of the add_iface/remove_iface - callbacks as well as the interface configuration callbacks. - - Things related to AP mode should be discussed there. - - Things related to supporting multiple interfaces should be - in the appropriate chapter, a BIG FAT note should be here about - this though and the recommendation to allow only a single - interface in STA mode at first! - -!Finclude/net/mac80211.h ieee80211_vif - - - - Receive and transmit processing - - what should be here - TBD - - This should describe the receive and transmit - paths in mac80211/the drivers as well as - transmit status handling. - - - - Frame format -!Pinclude/net/mac80211.h Frame format - - - Packet alignment -!Pnet/mac80211/rx.c Packet alignment - - - Calling into mac80211 from interrupts -!Pinclude/net/mac80211.h Calling mac80211 from interrupts - - - functions/definitions -!Finclude/net/mac80211.h ieee80211_rx_status -!Finclude/net/mac80211.h mac80211_rx_flags -!Finclude/net/mac80211.h mac80211_tx_info_flags -!Finclude/net/mac80211.h mac80211_tx_control_flags -!Finclude/net/mac80211.h mac80211_rate_control_flags -!Finclude/net/mac80211.h ieee80211_tx_rate -!Finclude/net/mac80211.h ieee80211_tx_info -!Finclude/net/mac80211.h ieee80211_tx_info_clear_status -!Finclude/net/mac80211.h ieee80211_rx -!Finclude/net/mac80211.h ieee80211_rx_ni -!Finclude/net/mac80211.h ieee80211_rx_irqsafe -!Finclude/net/mac80211.h ieee80211_tx_status -!Finclude/net/mac80211.h ieee80211_tx_status_ni -!Finclude/net/mac80211.h ieee80211_tx_status_irqsafe -!Finclude/net/mac80211.h ieee80211_rts_get -!Finclude/net/mac80211.h ieee80211_rts_duration -!Finclude/net/mac80211.h ieee80211_ctstoself_get -!Finclude/net/mac80211.h ieee80211_ctstoself_duration -!Finclude/net/mac80211.h ieee80211_generic_frame_duration -!Finclude/net/mac80211.h ieee80211_wake_queue -!Finclude/net/mac80211.h ieee80211_stop_queue -!Finclude/net/mac80211.h ieee80211_wake_queues -!Finclude/net/mac80211.h ieee80211_stop_queues -!Finclude/net/mac80211.h ieee80211_queue_stopped - - - - - Frame filtering -!Pinclude/net/mac80211.h Frame filtering -!Finclude/net/mac80211.h ieee80211_filter_flags - - - - The mac80211 workqueue -!Pinclude/net/mac80211.h mac80211 workqueue -!Finclude/net/mac80211.h ieee80211_queue_work -!Finclude/net/mac80211.h ieee80211_queue_delayed_work - - - - - Advanced driver interface - - - Information contained within this part of the book is - of interest only for advanced interaction of mac80211 - with drivers to exploit more hardware capabilities and - improve performance. - - - - - LED support - - Mac80211 supports various ways of blinking LEDs. Wherever possible, - device LEDs should be exposed as LED class devices and hooked up to - the appropriate trigger, which will then be triggered appropriately - by mac80211. 
- -!Finclude/net/mac80211.h ieee80211_get_tx_led_name -!Finclude/net/mac80211.h ieee80211_get_rx_led_name -!Finclude/net/mac80211.h ieee80211_get_assoc_led_name -!Finclude/net/mac80211.h ieee80211_get_radio_led_name -!Finclude/net/mac80211.h ieee80211_tpt_blink -!Finclude/net/mac80211.h ieee80211_tpt_led_trigger_flags -!Finclude/net/mac80211.h ieee80211_create_tpt_led_trigger - - - - Hardware crypto acceleration -!Pinclude/net/mac80211.h Hardware crypto acceleration - -!Finclude/net/mac80211.h set_key_cmd -!Finclude/net/mac80211.h ieee80211_key_conf -!Finclude/net/mac80211.h ieee80211_key_flags -!Finclude/net/mac80211.h ieee80211_get_tkip_p1k -!Finclude/net/mac80211.h ieee80211_get_tkip_p1k_iv -!Finclude/net/mac80211.h ieee80211_get_tkip_p2k - - - - Powersave support -!Pinclude/net/mac80211.h Powersave support - - - - Beacon filter support -!Pinclude/net/mac80211.h Beacon filter support -!Finclude/net/mac80211.h ieee80211_beacon_loss - - - - Multiple queues and QoS support - TBD -!Finclude/net/mac80211.h ieee80211_tx_queue_params - - - - Access point mode support - TBD - Some parts of the if_conf should be discussed here instead - - Insert notes about VLAN interfaces with hw crypto here or - in the hw crypto chapter. - -
- support for powersaving clients -!Pinclude/net/mac80211.h AP support for powersaving clients -!Finclude/net/mac80211.h ieee80211_get_buffered_bc -!Finclude/net/mac80211.h ieee80211_beacon_get -!Finclude/net/mac80211.h ieee80211_sta_eosp -!Finclude/net/mac80211.h ieee80211_frame_release_type -!Finclude/net/mac80211.h ieee80211_sta_ps_transition -!Finclude/net/mac80211.h ieee80211_sta_ps_transition_ni -!Finclude/net/mac80211.h ieee80211_sta_set_buffered -!Finclude/net/mac80211.h ieee80211_sta_block_awake -
-
- - - Supporting multiple virtual interfaces - TBD - - Note: WDS with identical MAC address should almost always be OK - - - Insert notes about having multiple virtual interfaces with - different MAC addresses here, note which configurations are - supported by mac80211, add notes about supporting hw crypto - with it. - -!Finclude/net/mac80211.h ieee80211_iterate_active_interfaces -!Finclude/net/mac80211.h ieee80211_iterate_active_interfaces_atomic - - - - Station handling - TODO -!Finclude/net/mac80211.h ieee80211_sta -!Finclude/net/mac80211.h sta_notify_cmd -!Finclude/net/mac80211.h ieee80211_find_sta -!Finclude/net/mac80211.h ieee80211_find_sta_by_ifaddr - - - - Hardware scan offload - TBD -!Finclude/net/mac80211.h ieee80211_scan_completed - - - - Aggregation - - TX A-MPDU aggregation -!Pnet/mac80211/agg-tx.c TX A-MPDU aggregation -!Cnet/mac80211/agg-tx.c - - - RX A-MPDU aggregation -!Pnet/mac80211/agg-rx.c RX A-MPDU aggregation -!Cnet/mac80211/agg-rx.c -!Finclude/net/mac80211.h ieee80211_ampdu_mlme_action - - - - - Spatial Multiplexing Powersave (SMPS) -!Pinclude/net/mac80211.h Spatial multiplexing power save -!Finclude/net/mac80211.h ieee80211_request_smps -!Finclude/net/mac80211.h ieee80211_smps_mode - -
- - - Rate control interface - - TBD - - This part of the book describes the rate control algorithm - interface and how it relates to mac80211 and drivers. - - - - Rate Control API - TBD -!Finclude/net/mac80211.h ieee80211_start_tx_ba_session -!Finclude/net/mac80211.h ieee80211_start_tx_ba_cb_irqsafe -!Finclude/net/mac80211.h ieee80211_stop_tx_ba_session -!Finclude/net/mac80211.h ieee80211_stop_tx_ba_cb_irqsafe -!Finclude/net/mac80211.h ieee80211_rate_control_changed -!Finclude/net/mac80211.h ieee80211_tx_rate_control -!Finclude/net/mac80211.h rate_control_send_low - - - - - Internals - - TBD - - This part of the book describes mac80211 internals. - - - - - Key handling - - Key handling basics -!Pnet/mac80211/key.c Key handling basics - - - MORE TBD - TBD - - - - - Receive processing - TBD - - - - Transmit processing - TBD - - - - Station info handling - - Programming information -!Fnet/mac80211/sta_info.h sta_info -!Fnet/mac80211/sta_info.h ieee80211_sta_info_flags - - - STA information lifetime rules -!Pnet/mac80211/sta_info.c STA information lifetime rules - - - - - Aggregation -!Fnet/mac80211/sta_info.h sta_ampdu_mlme -!Fnet/mac80211/sta_info.h tid_ampdu_tx -!Fnet/mac80211/sta_info.h tid_ampdu_rx - - - - Synchronisation - TBD - Locking, lots of RCU - - -
-
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 736f5916daea..fdf8232d0eeb 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -12,7 +12,7 @@ DOCBOOKS := z8530book.xml \ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ - 80211.xml debugobjects.xml sh.xml regulator.xml \ + debugobjects.xml sh.xml regulator.xml \ alsa-driver-api.xml writing-an-alsa-driver.xml \ tracepoint.xml w1.xml \ writing_musb_glue_layer.xml crypto-API.xml iio.xml diff --git a/Documentation/index.rst b/Documentation/index.rst index d9ccb94fca95..c53d089455a4 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -17,6 +17,7 @@ Contents: driver-api/index media/index gpu/index + 80211/index Indices and tables ================== diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c23f4d33e06..02b6690d7162 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -775,9 +775,9 @@ enum station_parameters_apply_mask { * (or NULL for no change) * @supported_rates_len: number of supported rates * @sta_flags_mask: station flags that changed - * (bitmask of BIT(NL80211_STA_FLAG_...)) + * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @sta_flags_set: station flags values - * (bitmask of BIT(NL80211_STA_FLAG_...)) + * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @listen_interval: listen interval or -1 for no change * @aid: AID or zero for no change * @peer_aid: mesh peer AID or zero for no change @@ -2947,47 +2947,54 @@ struct ieee80211_iface_limit { * * 1. Allow #STA <= 1, #AP <= 1, matching BI, channels = 1, 2 total: * - * struct ieee80211_iface_limit limits1[] = { - * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, - * { .max = 1, .types = BIT(NL80211_IFTYPE_AP}, }, - * }; - * struct ieee80211_iface_combination combination1 = { - * .limits = limits1, - * .n_limits = ARRAY_SIZE(limits1), - * .max_interfaces = 2, - * .beacon_int_infra_match = true, - * }; + * .. code-block:: c + * + * struct ieee80211_iface_limit limits1[] = { + * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, + * { .max = 1, .types = BIT(NL80211_IFTYPE_AP}, }, + * }; + * struct ieee80211_iface_combination combination1 = { + * .limits = limits1, + * .n_limits = ARRAY_SIZE(limits1), + * .max_interfaces = 2, + * .beacon_int_infra_match = true, + * }; * * * 2. Allow #{AP, P2P-GO} <= 8, channels = 1, 8 total: * - * struct ieee80211_iface_limit limits2[] = { - * { .max = 8, .types = BIT(NL80211_IFTYPE_AP) | - * BIT(NL80211_IFTYPE_P2P_GO), }, - * }; - * struct ieee80211_iface_combination combination2 = { - * .limits = limits2, - * .n_limits = ARRAY_SIZE(limits2), - * .max_interfaces = 8, - * .num_different_channels = 1, - * }; + * .. code-block:: c + * + * struct ieee80211_iface_limit limits2[] = { + * { .max = 8, .types = BIT(NL80211_IFTYPE_AP) | + * BIT(NL80211_IFTYPE_P2P_GO), }, + * }; + * struct ieee80211_iface_combination combination2 = { + * .limits = limits2, + * .n_limits = ARRAY_SIZE(limits2), + * .max_interfaces = 8, + * .num_different_channels = 1, + * }; * * * 3. Allow #STA <= 1, #{P2P-client,P2P-GO} <= 3 on two channels, 4 total. * - * This allows for an infrastructure connection and three P2P connections. 
- * - * struct ieee80211_iface_limit limits3[] = { - * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, - * { .max = 3, .types = BIT(NL80211_IFTYPE_P2P_GO) | - * BIT(NL80211_IFTYPE_P2P_CLIENT), }, - * }; - * struct ieee80211_iface_combination combination3 = { - * .limits = limits3, - * .n_limits = ARRAY_SIZE(limits3), - * .max_interfaces = 4, - * .num_different_channels = 2, - * }; + * This allows for an infrastructure connection and three P2P connections. + * + * .. code-block:: c + * + * struct ieee80211_iface_limit limits3[] = { + * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, + * { .max = 3, .types = BIT(NL80211_IFTYPE_P2P_GO) | + * BIT(NL80211_IFTYPE_P2P_CLIENT), }, + * }; + * struct ieee80211_iface_combination combination3 = { + * .limits = limits3, + * .n_limits = ARRAY_SIZE(limits3), + * .max_interfaces = 4, + * .num_different_channels = 2, + * }; + * */ struct ieee80211_iface_combination { const struct ieee80211_iface_limit *limits; -- cgit v1.2.3 From 7f6990c830f3e8d703f13d7963acf51936c52ad2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Oct 2016 15:29:49 +0200 Subject: cfg80211: let ieee80211_amsdu_to_8023s() take only header-less SKB There's only a single case where has_80211_header is passed as true, which is in mac80211. Given that there's only simple code that needs to be done before calling it, export that function from cfg80211 instead and let mac80211 call it itself. Signed-off-by: Johannes Berg --- .../net/wireless/marvell/mwifiex/11n_rxreorder.c | 2 +- include/net/cfg80211.h | 31 +++++++++++++++------- net/mac80211/rx.c | 8 +++++- net/wireless/util.c | 24 ++++------------- 4 files changed, 35 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c index 94480123efa3..e9f462e3730b 100644 --- a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c +++ b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c @@ -45,7 +45,7 @@ static int mwifiex_11n_dispatch_amsdu_pkt(struct mwifiex_private *priv, skb_trim(skb, le16_to_cpu(local_rx_pd->rx_pkt_length)); ieee80211_amsdu_to_8023s(skb, &list, priv->curr_addr, - priv->wdev.iftype, 0, false); + priv->wdev.iftype, 0); while (!skb_queue_empty(&list)) { struct rx_packet_hdr *rx_hdr; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fe78f02a242e..f372eadd1963 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4039,6 +4039,18 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); * that do not do the 802.11/802.3 conversion on the device. */ +/** + * ieee80211_data_to_8023_exthdr - convert an 802.11 data frame to 802.3 + * @skb: the 802.11 data frame + * @ehdr: pointer to a &struct ethhdr that will get the header, instead + * of it being pushed into the SKB + * @addr: the device MAC address + * @iftype: the virtual interface type + * Return: 0 on success. Non-zero on error. + */ +int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, + const u8 *addr, enum nl80211_iftype iftype); + /** * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3 * @skb: the 802.11 data frame @@ -4046,8 +4058,11 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); * @iftype: the virtual interface type * Return: 0 on success. Non-zero on error. 
*/ -int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype); +static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +{ + return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype); +} /** * ieee80211_data_from_8023 - convert an 802.3 frame to 802.11 @@ -4065,22 +4080,20 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, /** * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame * - * Decode an IEEE 802.11n A-MSDU frame and convert it to a list of - * 802.3 frames. The @list will be empty if the decode fails. The - * @skb is consumed after the function returns. + * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames. + * The @list will be empty if the decode fails. The @skb must be fully + * header-less before being passed in here; it is freed in this function. * - * @skb: The input IEEE 802.11n A-MSDU frame. + * @skb: The input A-MSDU frame without any headers. * @list: The output list of 802.3 frames. It must be allocated and * initialized by by the caller. * @addr: The device MAC address. * @iftype: The device interface type. * @extra_headroom: The hardware extra headroom for SKBs in the @list. - * @has_80211_header: Set it true if SKB is with IEEE 802.11 header. */ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, - const unsigned int extra_headroom, - bool has_80211_header); + const unsigned int extra_headroom); /** * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3ac2f1cba317..de2d75fa5c51 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2298,6 +2298,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + struct ethhdr ethhdr; if (unlikely(!ieee80211_is_data(fc))) return RX_CONTINUE; @@ -2329,9 +2330,14 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) skb->dev = dev; __skb_queue_head_init(&frame_list); + if (ieee80211_data_to_8023_exthdr(skb, ðhdr, + rx->sdata->vif.addr, + rx->sdata->vif.type)) + return RX_DROP_UNUSABLE; + ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, - rx->local->hw.extra_tx_headroom, true); + rx->local->hw.extra_tx_headroom); while (!skb_queue_empty(&frame_list)) { rx->skb = __skb_dequeue(&frame_list); diff --git a/net/wireless/util.c b/net/wireless/util.c index 8edce22d1b93..e36ede840b88 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -420,8 +420,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); -static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr, - const u8 *addr, enum nl80211_iftype iftype) +int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, + const u8 *addr, enum nl80211_iftype iftype) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { @@ -525,13 +525,7 @@ static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr, return 0; } - -int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype) -{ - return __ieee80211_data_to_8023(skb, NULL, addr, iftype); -} -EXPORT_SYMBOL(ieee80211_data_to_8023); +EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); int ieee80211_data_from_8023(struct sk_buff *skb, 
const u8 *addr, enum nl80211_iftype iftype, @@ -745,25 +739,18 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, - const unsigned int extra_headroom, - bool has_80211_header) + const unsigned int extra_headroom) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; u16 ethertype; u8 *payload; - int offset = 0, remaining, err; + int offset = 0, remaining; struct ethhdr eth; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; - if (has_80211_header) { - err = __ieee80211_data_to_8023(skb, ð, addr, iftype); - if (err) - goto out; - } - while (!last) { unsigned int subframe_len; int len; @@ -819,7 +806,6 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, purge: __skb_queue_purge(list); - out: dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); -- cgit v1.2.3 From 8b935ee2ea17db720d70f6420f77f594c0c93f75 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Oct 2016 16:17:01 +0200 Subject: cfg80211: add ability to check DA/SA in A-MSDU decapsulation We should not accept arbitrary DA/SA inside A-MSDUs, it could be used to circumvent protections, like allowing a station to send frames and make them seem to come from somewhere else. Add the necessary infrastructure in cfg80211 to allow such checks, in further patches we'll start using them. Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c | 2 +- include/net/cfg80211.h | 5 ++++- net/mac80211/rx.c | 3 ++- net/wireless/util.c | 14 ++++++++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c index e9f462e3730b..274dd5a1574a 100644 --- a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c +++ b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c @@ -45,7 +45,7 @@ static int mwifiex_11n_dispatch_amsdu_pkt(struct mwifiex_private *priv, skb_trim(skb, le16_to_cpu(local_rx_pd->rx_pkt_length)); ieee80211_amsdu_to_8023s(skb, &list, priv->curr_addr, - priv->wdev.iftype, 0); + priv->wdev.iftype, 0, NULL, NULL); while (!skb_queue_empty(&list)) { struct rx_packet_hdr *rx_hdr; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f372eadd1963..7df600c463eb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4090,10 +4090,13 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, * @addr: The device MAC address. * @iftype: The device interface type. * @extra_headroom: The hardware extra headroom for SKBs in the @list. 
+ * @check_da: DA to check in the inner ethernet header, or NULL + * @check_sa: SA to check in the inner ethernet header, or NULL */ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, - const unsigned int extra_headroom); + const unsigned int extra_headroom, + const u8 *check_da, const u8 *check_sa); /** * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index de2d75fa5c51..cf53fe1a0aa2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2337,7 +2337,8 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, - rx->local->hw.extra_tx_headroom); + rx->local->hw.extra_tx_headroom, + NULL, NULL); while (!skb_queue_empty(&frame_list)) { rx->skb = __skb_dequeue(&frame_list); diff --git a/net/wireless/util.c b/net/wireless/util.c index e36ede840b88..5ea12afc7706 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -739,7 +739,8 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, - const unsigned int extra_headroom) + const unsigned int extra_headroom, + const u8 *check_da, const u8 *check_sa) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; @@ -767,8 +768,17 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, goto purge; offset += sizeof(struct ethhdr); - /* reuse skb for the last subframe */ last = remaining <= subframe_len + padding; + + /* FIXME: should we really accept multicast DA? */ + if ((check_da && !is_multicast_ether_addr(eth.h_dest) && + !ether_addr_equal(check_da, eth.h_dest)) || + (check_sa && !ether_addr_equal(check_sa, eth.h_source))) { + offset += len + padding; + continue; + } + + /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; -- cgit v1.2.3 From 899bb6642f2a2f2cd3f77abd6c5a14550e3b37e6 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Wed, 12 Oct 2016 08:45:05 +0530 Subject: cpufreq: skip invalid entries when searching the frequency Skip invalid entries when searching the frequency. This fixes cpufreq at least on loongson2 MIPS board. Fixes: da0c6dc00c69 (cpufreq: Handle sorted frequency tables more efficiently) Signed-off-by: Aaro Koskinen Signed-off-by: Viresh Kumar Cc: 4.8+ # 4.8+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 104 ++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 631ba33bbe9f..5fa55fc56e18 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -639,19 +639,19 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq >= target_freq) - return i; + return pos - table; - best = i; + best = pos; } - return best; + return best - table; } /* Find lowest freq at or above target in a table in descending order */ @@ -659,28 +659,28 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq == target_freq) - return i; + return pos - table; if (freq > target_freq) { - best = i; + best = pos; continue; } /* No freq found above target_freq */ - if (best == -1) - return i; + if (best == table - 1) + return pos - table; - return best; + return best - pos; } - return best; + return best - pos; } /* Works only on sorted freq-tables */ @@ -700,28 +700,28 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq == target_freq) - return i; + return pos - table; if (freq < target_freq) { - best = i; + best = pos; continue; } /* No freq found below target_freq */ - if (best == -1) - return i; + if (best == table - 1) + return pos - table; - return best; + return best - table; } - return best; + return best - table; } /* Find highest freq at or below target in a table in descending order */ @@ -729,19 +729,19 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq <= target_freq) - return i; + return pos - table; - best = i; + best = pos; } - return best; + return best - table; } /* Works only on sorted freq-tables */ @@ -761,32 +761,32 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != 
CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq == target_freq) - return i; + return pos - table; if (freq < target_freq) { - best = i; + best = pos; continue; } /* No freq found below target_freq */ - if (best == -1) - return i; + if (best == table - 1) + return pos - table; /* Choose the closest freq */ - if (target_freq - table[best].frequency > freq - target_freq) - return i; + if (target_freq - best->frequency > freq - target_freq) + return pos - table; - return best; + return best - table; } - return best; + return best - table; } /* Find closest freq to target in a table in descending order */ @@ -794,32 +794,32 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpufreq_frequency_table *table = policy->freq_table; + struct cpufreq_frequency_table *pos, *best = table - 1; unsigned int freq; - int i, best = -1; - for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) { - freq = table[i].frequency; + cpufreq_for_each_valid_entry(pos, table) { + freq = pos->frequency; if (freq == target_freq) - return i; + return pos - table; if (freq > target_freq) { - best = i; + best = pos; continue; } /* No freq found above target_freq */ - if (best == -1) - return i; + if (best == table - 1) + return pos - table; /* Choose the closest freq */ - if (table[best].frequency - target_freq > target_freq - freq) - return i; + if (best->frequency - target_freq > target_freq - freq) + return pos - table; - return best; + return best - table; } - return best; + return best - table; } /* Works only on sorted freq-tables */ -- cgit v1.2.3 From d60874cd58fcb21372f2df698c20f8cf2f78fdcb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 4 Oct 2016 14:40:45 +0200 Subject: vfs: add vfs_get_link() helper This helper is for filesystems that want to read the symlink and are better off with the get_link() interface (returning a char *) rather than the readlink() interface (copy into a userspace buffer). Also call the LSM hook for readlink (not get_link) since this is for symlink reading not following. Signed-off-by: Miklos Szeredi --- fs/namei.c | 25 +++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index adb04146df09..8a2c2959da08 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4677,6 +4677,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(generic_readlink); +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. 
+ * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) +{ + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); + + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; +} +EXPORT_SYMBOL(vfs_get_link); + /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) diff --git a/include/linux/fs.h b/include/linux/fs.h index 901e25d495cc..bc8ac5108368 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2919,6 +2919,7 @@ extern int vfs_stat(const char __user *, struct kstat *); extern int vfs_lstat(const char __user *, struct kstat *); extern int vfs_fstat(unsigned int, struct kstat *); extern int vfs_fstatat(int , const char __user *, struct kstat *, int); +extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.3 From 9d6280da39c04832d6abf5f4ecc8175c9aad91c0 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 13 Oct 2016 18:50:02 +0200 Subject: IPv6: Drop the temporary address regen_timer The randomized interface identifier (rndid) was periodically updated from the regen_timer timer. Simplify the code by updating the rndid only when needed by ipv6_try_regen_rndid(). This makes the follow-up DESYNC_FACTOR fix much simpler. Also it fixes a reference counting error in this error path, where an in6_dev_put was missing: err = addrconf_sysctl_register(ndev); if (err) { ipv6_mc_destroy_dev(ndev); - del_timer(&ndev->regen_timer); snmp6_unregister_dev(ndev); goto err_release; Signed-off-by: Jiri Bohac Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - net/ipv6/addrconf.c | 61 ++++++++------------------------------------------ 2 files changed, 9 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 515352c6280a..ae707352f463 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -191,7 +191,6 @@ struct inet6_dev { int dead; u8 rndid[8]; - struct timer_list regen_timer; struct list_head tempaddr_list; struct in6_addr token; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9faafe58516a..58c3d73403a5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -147,9 +147,8 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -static void __ipv6_regen_rndid(struct inet6_dev *idev); -static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); -static void ipv6_regen_rndid(unsigned long data); +static void ipv6_regen_rndid(struct inet6_dev *idev); +static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(struct inet6_dev *idev); @@ -409,9 +408,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) goto err_release; } - /* One reference from device. We must do this before - * we invoke __ipv6_regen_rndid(). - */ + /* One reference from device. 
*/ in6_dev_hold(ndev); if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -425,17 +422,14 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) #endif INIT_LIST_HEAD(&ndev->tempaddr_list); - setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || dev->type == ARPHRD_TUNNEL6 || dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { ndev->cnf.use_tempaddr = -1; - } else { - in6_dev_hold(ndev); - ipv6_regen_rndid((unsigned long) ndev); - } + } else + ipv6_regen_rndid(ndev); ndev->token = in6addr_any; @@ -447,7 +441,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) err = addrconf_sysctl_register(ndev); if (err) { ipv6_mc_destroy_dev(ndev); - del_timer(&ndev->regen_timer); snmp6_unregister_dev(ndev); goto err_release; } @@ -1222,7 +1215,7 @@ retry: } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - __ipv6_try_regen_rndid(idev, tmpaddr); + ipv6_try_regen_rndid(idev, tmpaddr); memcpy(&addr.s6_addr[8], idev->rndid, 8); age = (now - ifp->tstamp) / HZ; tmp_valid_lft = min_t(__u32, @@ -2150,7 +2143,7 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) } /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static void __ipv6_regen_rndid(struct inet6_dev *idev) +static void ipv6_regen_rndid(struct inet6_dev *idev) { regen: get_random_bytes(idev->rndid, sizeof(idev->rndid)); @@ -2179,43 +2172,10 @@ regen: } } -static void ipv6_regen_rndid(unsigned long data) -{ - struct inet6_dev *idev = (struct inet6_dev *) data; - unsigned long expires; - - rcu_read_lock_bh(); - write_lock_bh(&idev->lock); - - if (idev->dead) - goto out; - - __ipv6_regen_rndid(idev); - - expires = jiffies + - idev->cnf.temp_prefered_lft * HZ - - idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) - - idev->cnf.max_desync_factor * HZ; - if (time_before(expires, jiffies)) { - pr_warn("%s: too short regeneration interval; timer disabled for %s\n", - __func__, idev->dev->name); - goto out; - } - - if (!mod_timer(&idev->regen_timer, expires)) - in6_dev_hold(idev); - -out: - write_unlock_bh(&idev->lock); - rcu_read_unlock_bh(); - in6_dev_put(idev); -} - -static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) +static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - __ipv6_regen_rndid(idev); + ipv6_regen_rndid(idev); } /* @@ -3594,9 +3554,6 @@ restart: if (!how) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); - if (how && del_timer(&idev->regen_timer)) - in6_dev_put(idev); - /* Step 3: clear tempaddr list */ while (!list_empty(&idev->tempaddr_list)) { ifa = list_first_entry(&idev->tempaddr_list, -- cgit v1.2.3 From 76506a986dc31394fd1f2741db037d29c7e57843 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 13 Oct 2016 18:52:15 +0200 Subject: IPv6: fix DESYNC_FACTOR The IPv6 temporary address generation uses a variable called DESYNC_FACTOR to prevent hosts updating the addresses at the same time. Quoting RFC 4941: ... The value DESYNC_FACTOR is a random value (different for each client) that ensures that clients don't synchronize with each other and generate new addresses at exactly the same time ... DESYNC_FACTOR is defined as: DESYNC_FACTOR -- A random value within the range 0 - MAX_DESYNC_FACTOR. 
It is computed once at system start (rather than each time it is used) and must never be greater than (TEMP_VALID_LIFETIME - REGEN_ADVANCE). First, I believe the RFC has a typo in it and meant to say: "and must never be greater than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE)" The reason is that at various places in the RFC, DESYNC_FACTOR is used in a calculation like (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR) or (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE - DESYNC_FACTOR). It needs to be smaller than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE) for the result of these calculations to be larger than zero. It's never used in a calculation together with TEMP_VALID_LIFETIME. I already submitted an errata to the rfc-editor: https://www.rfc-editor.org/errata_search.php?rfc=4941 The Linux implementation of DESYNC_FACTOR is very wrong: max_desync_factor is used in places DESYNC_FACTOR should be used. max_desync_factor is initialized to the RFC-recommended value for MAX_DESYNC_FACTOR (600) but the whole point is to get a _random_ value. And nothing ensures that the value used is not greater than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE), which leads to underflows. The effect can easily be observed when setting the temp_prefered_lft sysctl e.g. to 60. The preferred lifetime of the temporary addresses will be bogus. TEMP_PREFERRED_LIFETIME and REGEN_ADVANCE are not constants and can be influenced by these three sysctls: regen_max_retry, dad_transmits and temp_prefered_lft. Thus, the upper bound for desync_factor needs to be re-calculated each time a new address is generated and if desync_factor is larger than the new upper bound, a new random value needs to be re-generated. And since we already have max_desync_factor configurable per interface, we also need to calculate and store desync_factor per interface. Signed-off-by: Jiri Bohac Signed-off-by: David S. 
Miller --- include/net/if_inet6.h | 1 + net/ipv6/addrconf.c | 39 +++++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index ae707352f463..b0576cb2ab25 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -190,6 +190,7 @@ struct inet6_dev { __u32 if_flags; int dead; + u32 desync_factor; u8 rndid[8]; struct list_head tempaddr_list; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 58c3d73403a5..cc7c26d05f45 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -422,6 +422,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) #endif INIT_LIST_HEAD(&ndev->tempaddr_list); + ndev->desync_factor = U32_MAX; if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || dev->type == ARPHRD_TUNNEL6 || @@ -1183,6 +1184,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i int ret = 0; u32 addr_flags; unsigned long now = jiffies; + long max_desync_factor; write_lock_bh(&idev->lock); if (ift) { @@ -1218,20 +1220,41 @@ retry: ipv6_try_regen_rndid(idev, tmpaddr); memcpy(&addr.s6_addr[8], idev->rndid, 8); age = (now - ifp->tstamp) / HZ; + + regen_advance = idev->cnf.regen_max_retry * + idev->cnf.dad_transmits * + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + + /* recalculate max_desync_factor each time and update + * idev->desync_factor if it's larger + */ + max_desync_factor = min_t(__u32, + idev->cnf.max_desync_factor, + idev->cnf.temp_prefered_lft - regen_advance); + + if (unlikely(idev->desync_factor > max_desync_factor)) { + if (max_desync_factor > 0) { + get_random_bytes(&idev->desync_factor, + sizeof(idev->desync_factor)); + idev->desync_factor %= max_desync_factor; + } else { + idev->desync_factor = 0; + } + } + tmp_valid_lft = min_t(__u32, ifp->valid_lft, idev->cnf.temp_valid_lft + age); - tmp_prefered_lft = min_t(__u32, - ifp->prefered_lft, - idev->cnf.temp_prefered_lft + age - - idev->cnf.max_desync_factor); + tmp_prefered_lft = idev->cnf.temp_prefered_lft + age - + idev->desync_factor; + /* guard against underflow in case of concurrent updates to cnf */ + if (unlikely(tmp_prefered_lft < 0)) + tmp_prefered_lft = 0; + tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft); tmp_plen = ifp->prefix_len; tmp_tstamp = ifp->tstamp; spin_unlock_bh(&ifp->lock); - regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; write_unlock_bh(&idev->lock); /* A temporary address is created only if this calculated Preferred @@ -2316,7 +2339,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, max_valid = 0; max_prefered = idev->cnf.temp_prefered_lft - - idev->cnf.max_desync_factor - age; + idev->desync_factor - age; if (max_prefered < 0) max_prefered = 0; -- cgit v1.2.3 From 0189efb8f4f830b9ac7a7c56c0c6e260859e950d Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 13 Oct 2016 22:57:02 +0300 Subject: qed*: Fix Kconfig dependencies with INFINIBAND_QEDR The qedr driver would require a tristate Kconfig option [to allow it to compile as a module], and toward that end we've added the INFINIBAND_QEDR option. But as we've made the compilation of the qed/qede infrastructure required for RoCE dependent on the option we'd be facing linking difficulties in case that QED=y or QEDE=y, and INFINIBAND_QEDR=m. 
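The difficulty is a kbuild effect (sketched here for illustration): with QED=y and INFINIBAND_QEDR=m, the old qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o rule expands to qed-m, so qed_roce.o is never linked into the built-in qed, while code guarded by IS_ENABLED(CONFIG_INFINIBAND_QEDR), which is also true for =m, still references its symbols and the final link fails.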
To resolve this, we seperate between the INFINIBAND_QEDR option and the infrastructure support in qed/qede by introducing a new QED_RDMA option which would be selected by INFINIBAND_QEDR but would be a boolean instead of a tristate; Following that, the qed/qede is fixed based on this new option so that all config combinations would be supported. Fixes: cee9fbd8e2e9 ("qede: add qedr framework") Reported-by: Arnd Bergmann Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/Kconfig | 4 ++ drivers/net/ethernet/qlogic/qed/Makefile | 2 +- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 7 +-- drivers/net/ethernet/qlogic/qed/qed_dev.c | 14 ++--- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + drivers/net/ethernet/qlogic/qed/qed_ll2.h | 20 ------- drivers/net/ethernet/qlogic/qed/qed_main.c | 28 +++++---- drivers/net/ethernet/qlogic/qed/qed_roce.c | 91 +++++++++++++++--------------- drivers/net/ethernet/qlogic/qed/qed_roce.h | 75 +++++++++++++++--------- drivers/net/ethernet/qlogic/qed/qed_spq.c | 4 -- drivers/net/ethernet/qlogic/qede/Makefile | 2 +- include/linux/qed/qede_roce.h | 2 +- 12 files changed, 120 insertions(+), 130 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 0df1391f9663..77567727528a 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -107,10 +107,14 @@ config QEDE ---help--- This enables the support for ... +config QED_RDMA + bool + config INFINIBAND_QEDR tristate "QLogic qede RoCE sources [debug]" depends on QEDE && 64BIT select QED_LL2 + select QED_RDMA default n ---help--- This provides a temporary node that allows the compilation diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index cda0af7fbc20..967acf322c09 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -5,4 +5,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o qed-$(CONFIG_QED_LL2) += qed_ll2.o -qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o +qed-$(CONFIG_QED_RDMA) += qed_roce.o diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index 82370a1a59ad..277db7831f7b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -47,13 +47,8 @@ #define TM_ALIGN BIT(TM_SHIFT) #define TM_ELEM_SIZE 4 -/* ILT constants */ -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) /* For RoCE we configure to 64K to cover for RoCE max tasks 256K purpose. */ -#define ILT_DEFAULT_HW_P_SIZE 4 -#else -#define ILT_DEFAULT_HW_P_SIZE 3 -#endif +#define ILT_DEFAULT_HW_P_SIZE (IS_ENABLED(CONFIG_QED_RDMA) ? 4 : 3) #define ILT_PAGE_IN_BYTES(hw_p_size) (1U << ((hw_p_size) + 12)) #define ILT_CFG_REG(cli, reg) PSWRQ2_REG_ ## cli ## _ ## reg ## _RT_OFFSET diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 754f6a908858..21adf5208320 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1422,19 +1422,19 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn) u32 *feat_num = p_hwfn->hw_info.feat_num; int num_features = 1; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - /* Roce CNQ each requires: 1 status block + 1 CNQ. 
We divide the - * status blocks equally between L2 / RoCE but with consideration as - * to how many l2 queues / cnqs we have - */ - if (p_hwfn->hw_info.personality == QED_PCI_ETH_ROCE) { + if (IS_ENABLED(CONFIG_QED_RDMA) && + p_hwfn->hw_info.personality == QED_PCI_ETH_ROCE) { + /* Roce CNQ each requires: 1 status block + 1 CNQ. We divide + * the status blocks equally between L2 / RoCE but with + * consideration as to how many l2 queues / cnqs we have. + */ num_features++; feat_num[QED_RDMA_CNQ] = min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, RESC_NUM(p_hwfn, QED_RDMA_CNQ_RAM)); } -#endif + feat_num[QED_PF_L2_QUE] = min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, RESC_NUM(p_hwfn, QED_L2_QUEUE)); diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 02a8be2faed7..7856e5241b52 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -38,6 +38,7 @@ #include "qed_mcp.h" #include "qed_reg_addr.h" #include "qed_sp.h" +#include "qed_roce.h" #define QED_LL2_RX_REGISTERED(ll2) ((ll2)->rx_queue.b_cb_registred) #define QED_LL2_TX_REGISTERED(ll2) ((ll2)->tx_queue.b_cb_registred) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index 80a5dc2d652d..4e3d62a16cab 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -293,24 +293,4 @@ void qed_ll2_setup(struct qed_hwfn *p_hwfn, */ void qed_ll2_free(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_connections); -void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t rx_buf_addr, - u16 data_length, - u8 data_length_error, - u16 parse_flags, - u16 vlan, - u32 src_mac_addr_hi, - u16 src_mac_addr_lo, bool b_last_packet); -void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t first_frag_addr, - bool b_last_fragment, bool b_last_packet); -void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t first_frag_addr, - bool b_last_fragment, bool b_last_packet); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 4ee3151e80c2..6eb2401b9e22 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -33,10 +33,8 @@ #include "qed_hw.h" #include "qed_selftest.h" -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) #define QED_ROCE_QPS (8192) #define QED_ROCE_DPIS (8) -#endif static char version[] = "QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n"; @@ -682,9 +680,7 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, enum qed_int_mode int_mode) { struct qed_sb_cnt_info sb_cnt_info; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - int num_l2_queues; -#endif + int num_l2_queues = 0; int rc; int i; @@ -715,8 +711,9 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors - cdev->num_hwfns; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - num_l2_queues = 0; + if (!IS_ENABLED(CONFIG_QED_RDMA)) + return 0; + for_each_hwfn(cdev, i) num_l2_queues += FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE); @@ -738,7 +735,6 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, DP_VERBOSE(cdev, QED_MSG_RDMA, "roce_msix_cnt=%d roce_msix_base=%d\n", cdev->int_params.rdma_msix_cnt, cdev->int_params.rdma_msix_base); -#endif return 0; } @@ -843,18 
+839,20 @@ static void qed_update_pf_params(struct qed_dev *cdev, { int i; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - params->rdma_pf_params.num_qps = QED_ROCE_QPS; - params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; - /* divide by 3 the MRs to avoid MF ILT overflow */ - params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; - params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; -#endif for (i = 0; i < cdev->num_hwfns; i++) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; p_hwfn->pf_params = *params; } + + if (!IS_ENABLED(CONFIG_QED_RDMA)) + return; + + params->rdma_pf_params.num_qps = QED_ROCE_QPS; + params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; + /* divide by 3 the MRs to avoid MF ILT overflow */ + params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; + params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; } static int qed_slowpath_start(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index c73638ceb1ad..187df38542f7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -129,17 +129,12 @@ static void qed_bmap_release_id(struct qed_hwfn *p_hwfn, } } -u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) +static u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) { /* First sb id for RoCE is after all the l2 sb */ return FEAT_NUM((struct qed_hwfn *)p_hwfn, QED_PF_L2_QUE) + rel_sb_id; } -u32 qed_rdma_query_cau_timer_res(void *rdma_cxt) -{ - return QED_CAU_DEF_RX_TIMER_RES; -} - static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_rdma_start_in_params *params) @@ -275,7 +270,7 @@ free_rdma_info: return rc; } -void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) +static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) { struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; @@ -527,6 +522,26 @@ static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, return qed_spq_post(p_hwfn, p_ent, NULL); } +static int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) + goto out; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); + return rc; +} + static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn) { struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; @@ -573,7 +588,7 @@ static int qed_rdma_setup(struct qed_hwfn *p_hwfn, return qed_rdma_start_fw(p_hwfn, params, p_ptt); } -int qed_rdma_stop(void *rdma_cxt) +static int qed_rdma_stop(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_close_func_ramrod_data *p_ramrod; @@ -629,8 +644,8 @@ out: return rc; } -int qed_rdma_add_user(void *rdma_cxt, - struct qed_rdma_add_user_out_params *out_params) +static int qed_rdma_add_user(void *rdma_cxt, + struct qed_rdma_add_user_out_params *out_params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; u32 dpi_start_offset; @@ -664,7 +679,7 @@ int qed_rdma_add_user(void *rdma_cxt, return rc; } -struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) +static struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct qed_rdma_port *p_port = p_hwfn->p_rdma_info->port; @@ -680,7 +695,7 
@@ struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) return p_port; } -struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) +static struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -690,7 +705,7 @@ struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) return p_hwfn->p_rdma_info->dev; } -void qed_rdma_free_tid(void *rdma_cxt, u32 itid) +static void qed_rdma_free_tid(void *rdma_cxt, u32 itid) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -701,27 +716,7 @@ void qed_rdma_free_tid(void *rdma_cxt, u32 itid) spin_unlock_bh(&p_hwfn->p_rdma_info->lock); } -int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) -{ - struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; - int rc; - - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); - - spin_lock_bh(&p_hwfn->p_rdma_info->lock); - rc = qed_rdma_bmap_alloc_id(p_hwfn, - &p_hwfn->p_rdma_info->tid_map, itid); - spin_unlock_bh(&p_hwfn->p_rdma_info->lock); - if (rc) - goto out; - - rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); -out: - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); - return rc; -} - -void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) +static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) { struct qed_hwfn *p_hwfn; u16 qz_num; @@ -816,7 +811,7 @@ static int qed_rdma_get_int(struct qed_dev *cdev, struct qed_int_info *info) return 0; } -int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) +static int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; u32 returned_id; @@ -1985,9 +1980,9 @@ int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) return 0; } -int qed_rdma_query_qp(void *rdma_cxt, - struct qed_rdma_qp *qp, - struct qed_rdma_query_qp_out_params *out_params) +static int qed_rdma_query_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; int rc; @@ -2022,7 +2017,7 @@ int qed_rdma_query_qp(void *rdma_cxt, return rc; } -int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) +static int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; int rc = 0; @@ -2215,9 +2210,9 @@ static int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, return rc; } -int qed_rdma_modify_qp(void *rdma_cxt, - struct qed_rdma_qp *qp, - struct qed_rdma_modify_qp_in_params *params) +static int qed_rdma_modify_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; enum qed_roce_qp_state prev_state; @@ -2312,8 +2307,9 @@ int qed_rdma_modify_qp(void *rdma_cxt, return rc; } -int qed_rdma_register_tid(void *rdma_cxt, - struct qed_rdma_register_tid_in_params *params) +static int +qed_rdma_register_tid(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_register_tid_ramrod_data *p_ramrod; @@ -2450,7 +2446,7 @@ int qed_rdma_register_tid(void *rdma_cxt, return rc; } -int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) +static int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_deregister_tid_ramrod_data *p_ramrod; @@ -2561,7 +2557,8 @@ void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) 
qed_rdma_dpm_conf(p_hwfn, p_ptt); } -int qed_rdma_start(void *rdma_cxt, struct qed_rdma_start_in_params *params) +static int qed_rdma_start(void *rdma_cxt, + struct qed_rdma_start_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct qed_ptt *p_ptt; @@ -2601,7 +2598,7 @@ static int qed_rdma_init(struct qed_dev *cdev, return qed_rdma_start(QED_LEADING_HWFN(cdev), params); } -void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) +static void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index 2f091e8a0f40..691413176734 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -181,36 +181,55 @@ struct qed_rdma_qp { dma_addr_t shared_queue_phys_addr; }; -int -qed_rdma_add_user(void *rdma_cxt, - struct qed_rdma_add_user_out_params *out_params); -int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd); -int qed_rdma_alloc_tid(void *rdma_cxt, u32 *tid); -int qed_rdma_deregister_tid(void *rdma_cxt, u32 tid); -void qed_rdma_free_tid(void *rdma_cxt, u32 tid); -struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt); -struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt); -int -qed_rdma_register_tid(void *rdma_cxt, - struct qed_rdma_register_tid_in_params *params); -void qed_rdma_remove_user(void *rdma_cxt, u16 dpi); -int qed_rdma_start(void *p_hwfn, struct qed_rdma_start_in_params *params); -int qed_rdma_stop(void *rdma_cxt); -u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id); -u32 qed_rdma_query_cau_timer_res(void *p_hwfn); -void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 cnq_index, u16 prod); -void qed_rdma_resc_free(struct qed_hwfn *p_hwfn); +#if IS_ENABLED(CONFIG_QED_RDMA) +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe); -int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp); -int qed_rdma_modify_qp(void *rdma_cxt, struct qed_rdma_qp *qp, - struct qed_rdma_modify_qp_in_params *params); -int qed_rdma_query_qp(void *rdma_cxt, struct qed_rdma_qp *qp, - struct qed_rdma_query_qp_out_params *out_params); - -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) -void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); +void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); +void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, bool b_last_packet); #else -void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +static inline void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +static inline void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) {} +static inline void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) {} +static inline void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn 
*p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) {} +static inline void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, + bool b_last_packet) {} #endif #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index caff41544898..9fbaf9429fd0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -28,9 +28,7 @@ #include "qed_reg_addr.h" #include "qed_sp.h" #include "qed_sriov.h" -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) #include "qed_roce.h" -#endif /*************************************************************************** * Structures & Definitions @@ -240,11 +238,9 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) { switch (p_eqe->protocol_id) { -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) case PROTOCOLID_ROCE: qed_async_roce_event(p_hwfn, p_eqe); return 0; -#endif case PROTOCOLID_COMMON: return qed_sriov_eqe_event(p_hwfn, p_eqe->opcode, diff --git a/drivers/net/ethernet/qlogic/qede/Makefile b/drivers/net/ethernet/qlogic/qede/Makefile index 28dc58919c85..048a230c3ce0 100644 --- a/drivers/net/ethernet/qlogic/qede/Makefile +++ b/drivers/net/ethernet/qlogic/qede/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_QEDE) := qede.o qede-y := qede_main.o qede_ethtool.o qede-$(CONFIG_DCB) += qede_dcbnl.o -qede-$(CONFIG_INFINIBAND_QEDR) += qede_roce.o +qede-$(CONFIG_QED_RDMA) += qede_roce.o diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h index 99fbe6d55acb..f48d64b0e2fb 100644 --- a/include/linux/qed/qede_roce.h +++ b/include/linux/qed/qede_roce.h @@ -68,7 +68,7 @@ void qede_roce_unregister_driver(struct qedr_driver *drv); bool qede_roce_supported(struct qede_dev *dev); -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +#if IS_ENABLED(CONFIG_QED_RDMA) int qede_roce_dev_add(struct qede_dev *dev); void qede_roce_dev_event_open(struct qede_dev *dev); void qede_roce_dev_event_close(struct qede_dev *dev); -- cgit v1.2.3 From 496063426dece3f47e21f9f3387205d6ca03bd2a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 14 Oct 2016 14:26:24 -0400 Subject: pkeys: Remove easily triggered WARN This easy-to-trigger warning shows up instantly when running Trinity on a kernel with CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS disabled. At most this should have been a printk, but the -EINVAL alone should be more than adequate indicator that something isn't available. 
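With the WARN gone, the dummy helper for the pkeys-disabled case reduces to a plain error return, roughly (a sketch of the resulting stub, not the full header):

	static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
	{
		return -EINVAL;		/* protection keys not supported */
	}

so a fuzzer exercising pkey_free(2) on such a kernel simply gets -EINVAL back instead of triggering a kernel warning and backtrace.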
Signed-off-by: Dave Jones Signed-off-by: Linus Torvalds --- include/linux/pkeys.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index e4c08c1ff0c5..a1bacf1150b2 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -25,7 +25,6 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - WARN_ONCE(1, "free of protection key when disabled"); return -EINVAL; } -- cgit v1.2.3 From 2e0cbc4dd077aea4f1693583fd68eaed4d60464b Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Mon, 10 Oct 2016 13:15:30 +0300 Subject: qedr: Add RoCE driver framework Adds a skeletal implementation of the qed* RoCE driver - basically the ability to communicate with the qede driver and receive notifications from it regarding various init/exit events. Signed-off-by: Rajesh Borundia Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/Kconfig | 2 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/qedr/Kconfig | 7 + drivers/infiniband/hw/qedr/Makefile | 3 + drivers/infiniband/hw/qedr/main.c | 254 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/qedr/qedr.h | 61 +++++++++ drivers/net/ethernet/qlogic/Kconfig | 11 -- include/uapi/linux/pci_regs.h | 3 + 8 files changed, 331 insertions(+), 11 deletions(-) create mode 100644 drivers/infiniband/hw/qedr/Kconfig create mode 100644 drivers/infiniband/hw/qedr/Makefile create mode 100644 drivers/infiniband/hw/qedr/main.c create mode 100644 drivers/infiniband/hw/qedr/qedr.h (limited to 'include') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 19a418a1b631..fb3fb89640e5 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -89,4 +89,6 @@ source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" +source "drivers/infiniband/hw/qedr/Kconfig" + endif # INFINIBAND diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 21fe401ff178..e7a5ed9f6f3f 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ diff --git a/drivers/infiniband/hw/qedr/Kconfig b/drivers/infiniband/hw/qedr/Kconfig new file mode 100644 index 000000000000..7c06d85568d4 --- /dev/null +++ b/drivers/infiniband/hw/qedr/Kconfig @@ -0,0 +1,7 @@ +config INFINIBAND_QEDR + tristate "QLogic RoCE driver" + depends on 64BIT && QEDE + select QED_LL2 + ---help--- + This driver provides low-level InfiniBand over Ethernet + support for QLogic QED host channel adapters (HCAs). diff --git a/drivers/infiniband/hw/qedr/Makefile b/drivers/infiniband/hw/qedr/Makefile new file mode 100644 index 000000000000..3a5b7a288281 --- /dev/null +++ b/drivers/infiniband/hw/qedr/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_QEDR) := qedr.o + +qedr-y := main.o diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c new file mode 100644 index 000000000000..7387d029a35b --- /dev/null +++ b/drivers/infiniband/hw/qedr/main.c @@ -0,0 +1,254 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include "qedr.h" + +MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); +MODULE_AUTHOR("QLogic Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(QEDR_MODULE_VERSION); + +void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event ibev; + + ibev.device = &dev->ibdev; + ibev.element.port_num = port_num; + ibev.event = type; + + ib_dispatch_event(&ibev); +} + +static enum rdma_link_layer qedr_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int qedr_register_device(struct qedr_dev *dev) +{ + strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); + + memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); + dev->ibdev.owner = THIS_MODULE; + + dev->ibdev.get_link_layer = qedr_link_layer; + + return 0; +} + +/* QEDR sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qedr_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); + +static struct device_attribute *qedr_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type +}; + +static void qedr_remove_sysfiles(struct qedr_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) + device_remove_file(&dev->ibdev.dev, qedr_attributes[i]); +} + +static void qedr_pci_set_atomic(struct qedr_dev *dev, struct pci_dev *pdev) +{ + struct pci_dev *bridge; + u32 val; + + dev->atomic_cap = IB_ATOMIC_NONE; + + bridge = pdev->bus->self; + if (!bridge) + return; + + /* Check whether we are connected directly or via a switch */ + while (bridge && bridge->bus->parent) { + DP_DEBUG(dev, QEDR_MSG_INIT, + "Device is not connected directly to root. bridge->bus->number=%d primary=%d\n", + bridge->bus->number, bridge->bus->primary); + /* Need to check Atomic Op Routing Supported all the way to + * root complex. 
+ */ + pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &val); + if (!(val & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) { + pcie_capability_clear_word(pdev, + PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_ATOMIC_REQ); + return; + } + bridge = bridge->bus->parent->self; + } + bridge = pdev->bus->self; + + /* according to bridge capability */ + pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &val); + if (val & PCI_EXP_DEVCAP2_ATOMIC_COMP64) { + pcie_capability_set_word(pdev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_ATOMIC_REQ); + dev->atomic_cap = IB_ATOMIC_GLOB; + } else { + pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_ATOMIC_REQ); + } +} + +static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, + struct net_device *ndev) +{ + struct qedr_dev *dev; + int rc = 0, i; + + dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) { + pr_err("Unable to allocate ib device\n"); + return NULL; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr add device called\n"); + + dev->pdev = pdev; + dev->ndev = ndev; + dev->cdev = cdev; + + qedr_pci_set_atomic(dev, pdev); + + rc = qedr_register_device(dev); + if (rc) { + DP_ERR(dev, "Unable to allocate register device\n"); + goto init_err; + } + + for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) + if (device_create_file(&dev->ibdev.dev, qedr_attributes[i])) + goto init_err; + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr driver loaded successfully\n"); + return dev; + +init_err: + ib_dealloc_device(&dev->ibdev); + DP_ERR(dev, "qedr driver load failed rc=%d\n", rc); + + return NULL; +} + +static void qedr_remove(struct qedr_dev *dev) +{ + /* First unregister with stack to stop all the active traffic + * of the registered clients. + */ + qedr_remove_sysfiles(dev); + + ib_dealloc_device(&dev->ibdev); +} + +static int qedr_close(struct qedr_dev *dev) +{ + qedr_ib_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); + + return 0; +} + +static void qedr_shutdown(struct qedr_dev *dev) +{ + qedr_close(dev); + qedr_remove(dev); +} + +/* event handling via NIC driver ensures that all the NIC specific + * initialization done before RoCE driver notifies + * event to stack. + */ +static void qedr_notify(struct qedr_dev *dev, enum qede_roce_event event) +{ + switch (event) { + case QEDE_UP: + qedr_ib_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); + break; + case QEDE_DOWN: + qedr_close(dev); + break; + case QEDE_CLOSE: + qedr_shutdown(dev); + break; + case QEDE_CHANGE_ADDR: + qedr_ib_dispatch_event(dev, 1, IB_EVENT_GID_CHANGE); + break; + default: + pr_err("Event not supported\n"); + } +} + +static struct qedr_driver qedr_drv = { + .name = "qedr_driver", + .add = qedr_add, + .remove = qedr_remove, + .notify = qedr_notify, +}; + +static int __init qedr_init_module(void) +{ + return qede_roce_register_driver(&qedr_drv); +} + +static void __exit qedr_exit_module(void) +{ + qede_roce_unregister_driver(&qedr_drv); +} + +module_init(qedr_init_module); +module_exit(qedr_exit_module); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h new file mode 100644 index 000000000000..801417080f11 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -0,0 +1,61 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_H__ +#define __QEDR_H__ + +#include +#include +#include +#include + +#define QEDR_MODULE_VERSION "8.10.10.0" +#define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" +#define DP_NAME(dev) ((dev)->ibdev.name) + +#define DP_DEBUG(dev, module, fmt, ...) \ + pr_debug("(%s) " module ": " fmt, \ + DP_NAME(dev) ? DP_NAME(dev) : "", ## __VA_ARGS__) + +#define QEDR_MSG_INIT "INIT" + +struct qedr_dev { + struct ib_device ibdev; + struct qed_dev *cdev; + struct pci_dev *pdev; + struct net_device *ndev; + + enum ib_atomic_cap atomic_cap; + + u32 dp_module; + u8 dp_level; +}; +#endif diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 0df1391f9663..1e8339a67f6e 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -107,15 +107,4 @@ config QEDE ---help--- This enables the support for ... -config INFINIBAND_QEDR - tristate "QLogic qede RoCE sources [debug]" - depends on QEDE && 64BIT - select QED_LL2 - default n - ---help--- - This provides a temporary node that allows the compilation - and logical testing of the InfiniBand over Ethernet support - for QLogic QED. This would be replaced by the 'real' option - once the QEDR driver is added [+relocated]. 
- endif # NET_VENDOR_QLOGIC diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index d812172d1d7b..e5a2e68b2236 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -612,6 +612,8 @@ */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ +#define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */ #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */ #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */ #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ @@ -619,6 +621,7 @@ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */ #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */ #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ -- cgit v1.2.3 From ac1b36e55a5137e2f146e60be36d0cc81069feb6 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Mon, 10 Oct 2016 13:15:32 +0300 Subject: qedr: Add support for user context verbs Add support for ucontext, query port, add and del gid verbs. Signed-off-by: Rajesh Borundia Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/Makefile | 2 +- drivers/infiniband/hw/qedr/main.c | 26 ++ drivers/infiniband/hw/qedr/qedr.h | 34 +++ drivers/infiniband/hw/qedr/verbs.c | 456 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/qedr/verbs.h | 52 ++++ include/uapi/rdma/qedr-abi.h | 53 +++++ 6 files changed, 622 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/qedr/verbs.c create mode 100644 drivers/infiniband/hw/qedr/verbs.h create mode 100644 include/uapi/rdma/qedr-abi.h (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/Makefile b/drivers/infiniband/hw/qedr/Makefile index 3a5b7a288281..b10f2b17ada5 100644 --- a/drivers/infiniband/hw/qedr/Makefile +++ b/drivers/infiniband/hw/qedr/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_INFINIBAND_QEDR) := qedr.o -qedr-y := main.o +qedr-y := main.o verbs.o diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 67805941d138..55c4f565c214 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,8 @@ #include #include #include "qedr.h" +#include "verbs.h" +#include MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); MODULE_AUTHOR("QLogic Corporation"); @@ -80,6 +83,29 @@ static int qedr_register_device(struct qedr_dev *dev) memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION; + + dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | + QEDR_UVERBS(QUERY_DEVICE) | + QEDR_UVERBS(QUERY_PORT); + + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->num_cnq; + dev->ibdev.node_type = RDMA_NODE_IB_CA; + + dev->ibdev.query_device = qedr_query_device; + dev->ibdev.query_port = qedr_query_port; + dev->ibdev.modify_port = qedr_modify_port; + + dev->ibdev.query_gid = qedr_query_gid; + dev->ibdev.add_gid = qedr_add_gid; + dev->ibdev.del_gid = 
qedr_del_gid; + + dev->ibdev.alloc_ucontext = qedr_alloc_ucontext; + dev->ibdev.dealloc_ucontext = qedr_dealloc_ucontext; + dev->ibdev.mmap = qedr_mmap; + + dev->ibdev.dma_device = &dev->pdev->dev; dev->ibdev.get_link_layer = qedr_link_layer; dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 1adbebb887eb..2091c0d6e67d 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -49,6 +49,7 @@ DP_NAME(dev) ? DP_NAME(dev) : "", ## __VA_ARGS__) #define QEDR_MSG_INIT "INIT" +#define QEDR_MSG_MISC "MISC" struct qedr_dev; @@ -176,6 +177,39 @@ struct qedr_dev { #define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) +#define QEDR_ROCE_PKEY_MAX 1 +#define QEDR_ROCE_PKEY_TABLE_LEN 1 +#define QEDR_ROCE_PKEY_DEFAULT 0xffff + +struct qedr_ucontext { + struct ib_ucontext ibucontext; + struct qedr_dev *dev; + struct qedr_pd *pd; + u64 dpi_addr; + u64 dpi_phys_addr; + u32 dpi_size; + u16 dpi; + + struct list_head mm_head; + + /* Lock to protect mm list */ + struct mutex mm_list_lock; +}; + +struct qedr_mm { + struct { + u64 phy_addr; + unsigned long len; + } key; + struct list_head entry; +}; + +static inline +struct qedr_ucontext *get_qedr_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct qedr_ucontext, ibucontext); +} + static inline struct qedr_dev *get_qedr_dev(struct ib_device *ibdev) { return container_of(ibdev, struct qedr_dev, ibdev); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c new file mode 100644 index 000000000000..b9dcade1cb9f --- /dev/null +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -0,0 +1,456 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "qedr_hsi.h" +#include +#include "qedr.h" +#include "verbs.h" +#include + +int qedr_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *sgid) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + int rc = 0; + + if (!rdma_cap_roce_gid_table(ibdev, port)) + return -ENODEV; + + rc = ib_get_cached_gid(ibdev, port, index, sgid, NULL); + if (rc == -EAGAIN) { + memcpy(sgid, &zgid, sizeof(*sgid)); + return 0; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "query gid: index=%d %llx:%llx\n", index, + sgid->global.interface_id, sgid->global.subnet_prefix); + + return rc; +} + +int qedr_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + if (!rdma_cap_roce_gid_table(device, port_num)) + return -EINVAL; + + if (port_num > QEDR_MAX_PORT) + return -EINVAL; + + if (!context) + return -EINVAL; + + return 0; +} + +int qedr_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, void **context) +{ + if (!rdma_cap_roce_gid_table(device, port_num)) + return -EINVAL; + + if (port_num > QEDR_MAX_PORT) + return -EINVAL; + + if (!context) + return -EINVAL; + + return 0; +} + +int qedr_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qedr_device_attr *qattr = &dev->attr; + + if (!dev->rdma_ctx) { + DP_ERR(dev, + "qedr_query_device called with invalid params rdma_ctx=%p\n", + dev->rdma_ctx); + return -EINVAL; + } + + memset(attr, 0, sizeof(*attr)); + + attr->fw_ver = qattr->fw_ver; + attr->sys_image_guid = qattr->sys_image_guid; + attr->max_mr_size = qattr->max_mr_size; + attr->page_size_cap = qattr->page_size_caps; + attr->vendor_id = qattr->vendor_id; + attr->vendor_part_id = qattr->vendor_part_id; + attr->hw_ver = qattr->hw_ver; + attr->max_qp = qattr->max_qp; + attr->max_qp_wr = max_t(u32, qattr->max_sqe, qattr->max_rqe); + attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; + + attr->max_sge = qattr->max_sge; + attr->max_sge_rd = qattr->max_sge; + attr->max_cq = qattr->max_cq; + attr->max_cqe = qattr->max_cqe; + attr->max_mr = qattr->max_mr; + attr->max_mw = qattr->max_mw; + attr->max_pd = qattr->max_pd; + attr->atomic_cap = dev->atomic_cap; + attr->max_fmr = qattr->max_fmr; + attr->max_map_per_fmr = 16; + attr->max_qp_init_rd_atom = + 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); + attr->max_qp_rd_atom = + min(1 << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1), + attr->max_qp_init_rd_atom); + + attr->max_srq = qattr->max_srq; + attr->max_srq_sge = qattr->max_srq_sge; + attr->max_srq_wr = qattr->max_srq_wr; + + attr->local_ca_ack_delay = qattr->dev_ack_delay; + attr->max_fast_reg_page_list_len = qattr->max_mr / 8; + attr->max_pkeys = QEDR_ROCE_PKEY_MAX; + attr->max_ah = qattr->max_ah; + + return 0; +} + +#define QEDR_SPEED_SDR (1) +#define QEDR_SPEED_DDR (2) +#define QEDR_SPEED_QDR (4) +#define QEDR_SPEED_FDR10 (8) +#define QEDR_SPEED_FDR (16) +#define QEDR_SPEED_EDR (32) + +static inline void get_link_speed_and_width(int speed, u8 *ib_speed, + u8 *ib_width) +{ + switch (speed) { + case 1000: + *ib_speed = QEDR_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + case 10000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case 20000: + *ib_speed = 
QEDR_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case 25000: + *ib_speed = QEDR_SPEED_EDR; + *ib_width = IB_WIDTH_1X; + break; + + case 40000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + case 50000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + case 100000: + *ib_speed = QEDR_SPEED_EDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = QEDR_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + +int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr) +{ + struct qedr_dev *dev; + struct qed_rdma_port *rdma_port; + + dev = get_qedr_dev(ibdev); + if (port > 1) { + DP_ERR(dev, "invalid_port=0x%x\n", port); + return -EINVAL; + } + + if (!dev->rdma_ctx) { + DP_ERR(dev, "rdma_ctx is NULL\n"); + return -EINVAL; + } + + rdma_port = dev->ops->rdma_query_port(dev->rdma_ctx); + memset(attr, 0, sizeof(*attr)); + + if (rdma_port->port_state == QED_RDMA_PORT_UP) { + attr->state = IB_PORT_ACTIVE; + attr->phys_state = 5; + } else { + attr->state = IB_PORT_DOWN; + attr->phys_state = 3; + } + attr->max_mtu = IB_MTU_4096; + attr->active_mtu = iboe_get_mtu(dev->ndev->mtu); + attr->lid = 0; + attr->lmc = 0; + attr->sm_lid = 0; + attr->sm_sl = 0; + attr->port_cap_flags = IB_PORT_IP_BASED_GIDS; + attr->gid_tbl_len = QEDR_MAX_SGID; + attr->pkey_tbl_len = QEDR_ROCE_PKEY_TABLE_LEN; + attr->bad_pkey_cntr = rdma_port->pkey_bad_counter; + attr->qkey_viol_cntr = 0; + get_link_speed_and_width(rdma_port->link_speed, + &attr->active_speed, &attr->active_width); + attr->max_msg_sz = rdma_port->max_msg_size; + attr->max_vl_num = 4; + + return 0; +} + +int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct qedr_dev *dev; + + dev = get_qedr_dev(ibdev); + if (port > 1) { + DP_ERR(dev, "invalid_port=0x%x\n", port); + return -EINVAL; + } + + return 0; +} + +static int qedr_add_mmap(struct qedr_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct qedr_mm *mm; + + mm = kzalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) + return -ENOMEM; + + mm->key.phy_addr = phy_addr; + /* This function might be called with a length which is not a multiple + * of PAGE_SIZE, while the mapping is PAGE_SIZE grained and the kernel + * forces this granularity by increasing the requested size if needed. + * When qedr_mmap is called, it will search the list with the updated + * length as a key. To prevent search failures, the length is rounded up + * in advance to PAGE_SIZE. 
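+ * For example, with 4 KiB pages a requested length of 0x20 is stored
+ * here with key.len = 0x1000, matching the page-granular length that
+ * qedr_mmap() later receives from the mm core.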
+ */ + mm->key.len = roundup(len, PAGE_SIZE); + INIT_LIST_HEAD(&mm->entry); + + mutex_lock(&uctx->mm_list_lock); + list_add(&mm->entry, &uctx->mm_head); + mutex_unlock(&uctx->mm_list_lock); + + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "added (addr=0x%llx,len=0x%lx) for ctx=%p\n", + (unsigned long long)mm->key.phy_addr, + (unsigned long)mm->key.len, uctx); + + return 0; +} + +static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + bool found = false; + struct qedr_mm *mm; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry(mm, &uctx->mm_head, entry) { + if (len != mm->key.len || phy_addr != mm->key.phy_addr) + continue; + + found = true; + break; + } + mutex_unlock(&uctx->mm_list_lock); + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "searched for (addr=0x%llx,len=0x%lx) for ctx=%p, result=%d\n", + mm->key.phy_addr, mm->key.len, uctx, found); + + return found; +} + +struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int rc; + struct qedr_ucontext *ctx; + struct qedr_alloc_ucontext_resp uresp; + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qed_rdma_add_user_out_params oparams; + + if (!udata) + return ERR_PTR(-EFAULT); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams); + if (rc) { + DP_ERR(dev, + "failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n", + rc); + goto err; + } + + ctx->dpi = oparams.dpi; + ctx->dpi_addr = oparams.dpi_addr; + ctx->dpi_phys_addr = oparams.dpi_phys_addr; + ctx->dpi_size = oparams.dpi_size; + INIT_LIST_HEAD(&ctx->mm_head); + mutex_init(&ctx->mm_list_lock); + + memset(&uresp, 0, sizeof(uresp)); + + uresp.db_pa = ctx->dpi_phys_addr; + uresp.db_size = ctx->dpi_size; + uresp.max_send_wr = dev->attr.max_sqe; + uresp.max_recv_wr = dev->attr.max_rqe; + uresp.max_srq_wr = dev->attr.max_srq_wr; + uresp.sges_per_send_wr = QEDR_MAX_SQE_ELEMENTS_PER_SQE; + uresp.sges_per_recv_wr = QEDR_MAX_RQE_ELEMENTS_PER_RQE; + uresp.sges_per_srq_wr = dev->attr.max_srq_sge; + uresp.max_cqes = QEDR_MAX_CQES; + + rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + goto err; + + ctx->dev = dev; + + rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size); + if (rc) + goto err; + + DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n", + &ctx->ibucontext); + return &ctx->ibucontext; + +err: + kfree(ctx); + return ERR_PTR(rc); +} + +int qedr_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx); + struct qedr_mm *mm, *tmp; + int status = 0; + + DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n", + uctx); + uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi); + + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "deleted (addr=0x%llx,len=0x%lx) for ctx=%p\n", + mm->key.phy_addr, mm->key.len, uctx); + list_del(&mm->entry); + kfree(mm); + } + + kfree(uctx); + return status; +} + +int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct qedr_ucontext *ucontext = get_qedr_ucontext(context); + struct qedr_dev *dev = get_qedr_dev(context->device); + unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; + u64 unmapped_db = dev->db_phys_addr; + unsigned 
long len = (vma->vm_end - vma->vm_start); + int rc = 0; + bool found; + + DP_DEBUG(dev, QEDR_MSG_INIT, + "qedr_mmap called vm_page=0x%lx vm_pgoff=0x%lx unmapped_db=0x%llx db_size=%x, len=%lx\n", + vm_page, vma->vm_pgoff, unmapped_db, dev->db_size, len); + if (vma->vm_start & (PAGE_SIZE - 1)) { + DP_ERR(dev, "Vma_start not page aligned = %ld\n", + vma->vm_start); + return -EINVAL; + } + + found = qedr_search_mmap(ucontext, vm_page, len); + if (!found) { + DP_ERR(dev, "Vma_pgoff not found in mapped array = %ld\n", + vma->vm_pgoff); + return -EINVAL; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping doorbell bar\n"); + + if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + + dev->db_size))) { + DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping doorbell bar\n"); + if (vma->vm_flags & VM_READ) { + DP_ERR(dev, "Trying to map doorbell bar for read\n"); + return -EPERM; + } + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + rc = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + PAGE_SIZE, vma->vm_page_prot); + } else { + DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping chains\n"); + rc = remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff, len, vma->vm_page_prot); + } + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr_mmap return code: %d\n", rc); + return rc; +} diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h new file mode 100644 index 000000000000..9472044dd587 --- /dev/null +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -0,0 +1,52 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __QEDR_VERBS_H__ +#define __QEDR_VERBS_H__ + +int qedr_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, struct ib_udata *udata); +int qedr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); +int qedr_modify_port(struct ib_device *, u8 port, int mask, + struct ib_port_modify *props); + +int qedr_query_gid(struct ib_device *, u8 port, int index, union ib_gid *gid); + +struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); +int qedr_dealloc_ucontext(struct ib_ucontext *); + +int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); +int qedr_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, void **context); +int qedr_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context); +#endif diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h new file mode 100644 index 000000000000..f7c7fff7a877 --- /dev/null +++ b/include/uapi/rdma/qedr-abi.h @@ -0,0 +1,53 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_USER_H__ +#define __QEDR_USER_H__ + +#include + +#define QEDR_ABI_VERSION (8) + +/* user kernel communication data structures. */ + +struct qedr_alloc_ucontext_resp { + __u64 db_pa; + __u32 db_size; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_srq_wr; + __u32 sges_per_send_wr; + __u32 sges_per_recv_wr; + __u32 sges_per_srq_wr; + __u32 max_cqes; +}; +#endif /* __QEDR_USER_H__ */ -- cgit v1.2.3 From a7efd7773e31b60f695816c27393fc717a9df127 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Mon, 10 Oct 2016 13:15:33 +0300 Subject: qedr: Add support for PD,PKEY and CQ verbs Add support for protection domain and completion queue verbs. 
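For illustration only (not part of this patch), a kernel consumer reaches
these callbacks through the verbs core; a minimal sketch against the
4.9-era in-kernel API, where my_comp_handler and my_ctx are placeholders:

	struct ib_cq_init_attr cq_attr = { .cqe = 256, .comp_vector = 0 };
	struct ib_cq *cq;

	cq = ib_create_cq(ibdev, my_comp_handler, NULL, my_ctx, &cq_attr);
	if (IS_ERR(cq))
		return PTR_ERR(cq);
	/* request the next completion event; ends up in qedr_arm_cq() */
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);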
Signed-off-by: Rajesh Borundia Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/main.c | 49 ++- drivers/infiniband/hw/qedr/qedr.h | 78 +++++ drivers/infiniband/hw/qedr/qedr_hsi_rdma.h | 79 +++++ drivers/infiniband/hw/qedr/verbs.c | 539 +++++++++++++++++++++++++++++ drivers/infiniband/hw/qedr/verbs.h | 14 + include/uapi/rdma/qedr-abi.h | 19 + 6 files changed, 777 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 55c4f565c214..35928abb6b63 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -87,7 +87,14 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | QEDR_UVERBS(QUERY_DEVICE) | - QEDR_UVERBS(QUERY_PORT); + QEDR_UVERBS(QUERY_PORT) | + QEDR_UVERBS(ALLOC_PD) | + QEDR_UVERBS(DEALLOC_PD) | + QEDR_UVERBS(CREATE_COMP_CHANNEL) | + QEDR_UVERBS(CREATE_CQ) | + QEDR_UVERBS(RESIZE_CQ) | + QEDR_UVERBS(DESTROY_CQ) | + QEDR_UVERBS(REQ_NOTIFY_CQ); dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->num_cnq; @@ -105,6 +112,16 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.dealloc_ucontext = qedr_dealloc_ucontext; dev->ibdev.mmap = qedr_mmap; + dev->ibdev.alloc_pd = qedr_alloc_pd; + dev->ibdev.dealloc_pd = qedr_dealloc_pd; + + dev->ibdev.create_cq = qedr_create_cq; + dev->ibdev.destroy_cq = qedr_destroy_cq; + dev->ibdev.resize_cq = qedr_resize_cq; + dev->ibdev.req_notify_cq = qedr_arm_cq; + + dev->ibdev.query_pkey = qedr_query_pkey; + dev->ibdev.dma_device = &dev->pdev->dev; dev->ibdev.get_link_layer = qedr_link_layer; @@ -322,6 +339,8 @@ static irqreturn_t qedr_irq_handler(int irq, void *handle) { u16 hw_comp_cons, sw_comp_cons; struct qedr_cnq *cnq = handle; + struct regpair *cq_handle; + struct qedr_cq *cq; qed_sb_ack(cnq->sb, IGU_INT_DISABLE, 0); @@ -334,8 +353,36 @@ static irqreturn_t qedr_irq_handler(int irq, void *handle) rmb(); while (sw_comp_cons != hw_comp_cons) { + cq_handle = (struct regpair *)qed_chain_consume(&cnq->pbl); + cq = (struct qedr_cq *)(uintptr_t)HILO_U64(cq_handle->hi, + cq_handle->lo); + + if (cq == NULL) { + DP_ERR(cnq->dev, + "Received NULL CQ cq_handle->hi=%d cq_handle->lo=%d sw_comp_cons=%d hw_comp_cons=%d\n", + cq_handle->hi, cq_handle->lo, sw_comp_cons, + hw_comp_cons); + + break; + } + + if (cq->sig != QEDR_CQ_MAGIC_NUMBER) { + DP_ERR(cnq->dev, + "Problem with cq signature, cq_handle->hi=%d ch_handle->lo=%d cq=%p\n", + cq_handle->hi, cq_handle->lo, cq); + break; + } + + cq->arm_flags = 0; + + if (cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler) + (&cq->ibcq, cq->ibcq.cq_context); + sw_comp_cons = qed_chain_get_cons_idx(&cnq->pbl); + cnq->n_comp++; + } qed_ops->rdma_cnq_prod_update(cnq->dev->rdma_ctx, cnq->index, diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 2091c0d6e67d..9e2846a599ce 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -50,6 +50,10 @@ #define QEDR_MSG_INIT "INIT" #define QEDR_MSG_MISC "MISC" +#define QEDR_MSG_CQ " CQ" +#define QEDR_MSG_MR " MR" + +#define QEDR_CQ_MAGIC_NUMBER (0x11223344) struct qedr_dev; @@ -181,6 +185,12 @@ struct qedr_dev { #define QEDR_ROCE_PKEY_TABLE_LEN 1 #define QEDR_ROCE_PKEY_DEFAULT 0xffff +struct qedr_pbl { + struct list_head list_entry; + void *va; + dma_addr_t pa; +}; + struct qedr_ucontext { struct ib_ucontext ibucontext; struct qedr_dev *dev; @@ -196,6 +206,64 @@ struct qedr_ucontext { struct 
mutex mm_list_lock; }; +union db_prod64 { + struct rdma_pwm_val32_data data; + u64 raw; +}; + +enum qedr_cq_type { + QEDR_CQ_TYPE_GSI, + QEDR_CQ_TYPE_KERNEL, + QEDR_CQ_TYPE_USER, +}; + +struct qedr_pbl_info { + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + bool two_layered; +}; + +struct qedr_userq { + struct ib_umem *umem; + struct qedr_pbl_info pbl_info; + struct qedr_pbl *pbl_tbl; + u64 buf_addr; + size_t buf_len; +}; + +struct qedr_cq { + struct ib_cq ibcq; + + enum qedr_cq_type cq_type; + u32 sig; + + u16 icid; + + /* Lock to protect multiplem CQ's */ + spinlock_t cq_lock; + u8 arm_flags; + struct qed_chain pbl; + + void __iomem *db_addr; + union db_prod64 db; + + u8 pbl_toggle; + union rdma_cqe *latest_cqe; + union rdma_cqe *toggle_cqe; + + u32 cq_cons; + + struct qedr_userq q; +}; + +struct qedr_pd { + struct ib_pd ibpd; + u32 pd_id; + struct qedr_ucontext *uctx; +}; + struct qedr_mm { struct { u64 phy_addr; @@ -215,4 +283,14 @@ static inline struct qedr_dev *get_qedr_dev(struct ib_device *ibdev) return container_of(ibdev, struct qedr_dev, ibdev); } +static inline struct qedr_pd *get_qedr_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct qedr_pd, ibpd); +} + +static inline struct qedr_cq *get_qedr_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct qedr_cq, ibcq); +} + #endif diff --git a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h index 3e508fbd5e78..84f6520107cc 100644 --- a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h +++ b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h @@ -47,6 +47,19 @@ struct rdma_cqe_responder { __le32 imm_data_hi; __le16 rq_cons; u8 flags; +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_RESPONDER_TYPE_MASK 0x3 +#define RDMA_CQE_RESPONDER_TYPE_SHIFT 1 +#define RDMA_CQE_RESPONDER_INV_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_INV_FLG_SHIFT 3 +#define RDMA_CQE_RESPONDER_IMM_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_IMM_FLG_SHIFT 4 +#define RDMA_CQE_RESPONDER_RDMA_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_RDMA_FLG_SHIFT 5 +#define RDMA_CQE_RESPONDER_RESERVED2_MASK 0x3 +#define RDMA_CQE_RESPONDER_RESERVED2_SHIFT 6 + u8 status; }; struct rdma_cqe_requester { @@ -58,6 +71,12 @@ struct rdma_cqe_requester { __le32 reserved3; __le16 reserved4; u8 flags; +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_REQUESTER_TYPE_MASK 0x3 +#define RDMA_CQE_REQUESTER_TYPE_SHIFT 1 +#define RDMA_CQE_REQUESTER_RESERVED5_MASK 0x1F +#define RDMA_CQE_REQUESTER_RESERVED5_SHIFT 3 u8 status; }; @@ -66,6 +85,12 @@ struct rdma_cqe_common { struct regpair qp_handle; __le16 reserved1[7]; u8 flags; +#define RDMA_CQE_COMMON_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_COMMON_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_COMMON_TYPE_MASK 0x3 +#define RDMA_CQE_COMMON_TYPE_SHIFT 1 +#define RDMA_CQE_COMMON_RESERVED2_MASK 0x1F +#define RDMA_CQE_COMMON_RESERVED2_SHIFT 3 u8 status; }; @@ -76,6 +101,45 @@ union rdma_cqe { struct rdma_cqe_common cmn; }; +/* * CQE requester status enumeration */ +enum rdma_cqe_requester_status_enum { + RDMA_CQE_REQ_STS_OK, + RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR, + RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR, + RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR, + RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR, + 
RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR, + RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR, + MAX_RDMA_CQE_REQUESTER_STATUS_ENUM +}; + +/* CQE responder status enumeration */ +enum rdma_cqe_responder_status_enum { + RDMA_CQE_RESP_STS_OK, + RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR, + RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR, + MAX_RDMA_CQE_RESPONDER_STATUS_ENUM +}; + +/* CQE type enumeration */ +enum rdma_cqe_type { + RDMA_CQE_TYPE_REQUESTER, + RDMA_CQE_TYPE_RESPONDER_RQ, + RDMA_CQE_TYPE_RESPONDER_SRQ, + RDMA_CQE_TYPE_INVALID, + MAX_RDMA_CQE_TYPE +}; + struct rdma_sq_sge { __le32 length; struct regpair addr; @@ -93,4 +157,19 @@ struct rdma_srq_sge { __le32 length; __le32 l_key; }; + +/* Rdma doorbell data for CQ */ +struct rdma_pwm_val32_data { + __le16 icid; + u8 agg_flags; + u8 params; +#define RDMA_PWM_VAL32_DATA_AGG_CMD_MASK 0x3 +#define RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT 0 +#define RDMA_PWM_VAL32_DATA_BYPASS_EN_MASK 0x1 +#define RDMA_PWM_VAL32_DATA_BYPASS_EN_SHIFT 2 +#define RDMA_PWM_VAL32_DATA_RESERVED_MASK 0x1F +#define RDMA_PWM_VAL32_DATA_RESERVED_SHIFT 3 + __le32 value; +}; + #endif /* __QED_HSI_RDMA__ */ diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b9dcade1cb9f..b525c6cf1df0 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -49,6 +49,17 @@ #include "verbs.h" #include +#define DB_ADDR_SHIFT(addr) ((addr) << DB_PWM_ADDR_OFFSET_SHIFT) + +int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + if (index > QEDR_ROCE_PKEY_TABLE_LEN) + return -EINVAL; + + *pkey = QEDR_ROCE_PKEY_DEFAULT; + return 0; +} + int qedr_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *sgid) { @@ -454,3 +465,531 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) DP_DEBUG(dev, QEDR_MSG_INIT, "qedr_mmap return code: %d\n", rc); return rc; } + +struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qedr_ucontext *uctx = NULL; + struct qedr_alloc_pd_uresp uresp; + struct qedr_pd *pd; + u16 pd_id; + int rc; + + DP_DEBUG(dev, QEDR_MSG_INIT, "Function called from: %s\n", + (udata && context) ? 
"User Lib" : "Kernel"); + + if (!dev->rdma_ctx) { + DP_ERR(dev, "invlaid RDMA context\n"); + return ERR_PTR(-EINVAL); + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id); + + uresp.pd_id = pd_id; + pd->pd_id = pd_id; + + if (udata && context) { + rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id); + uctx = get_qedr_ucontext(context); + uctx->pd = pd; + pd->uctx = uctx; + } + + return &pd->ibpd; +} + +int qedr_dealloc_pd(struct ib_pd *ibpd) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_pd *pd = get_qedr_pd(ibpd); + + if (!pd) + pr_err("Invalid PD received in dealloc_pd\n"); + + DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id); + dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id); + + kfree(pd); + + return 0; +} + +static void qedr_free_pbl(struct qedr_dev *dev, + struct qedr_pbl_info *pbl_info, struct qedr_pbl *pbl) +{ + struct pci_dev *pdev = dev->pdev; + int i; + + for (i = 0; i < pbl_info->num_pbls; i++) { + if (!pbl[i].va) + continue; + dma_free_coherent(&pdev->dev, pbl_info->pbl_size, + pbl[i].va, pbl[i].pa); + } + + kfree(pbl); +} + +#define MIN_FW_PBL_PAGE_SIZE (4 * 1024) +#define MAX_FW_PBL_PAGE_SIZE (64 * 1024) + +#define NUM_PBES_ON_PAGE(_page_size) (_page_size / sizeof(u64)) +#define MAX_PBES_ON_PAGE NUM_PBES_ON_PAGE(MAX_FW_PBL_PAGE_SIZE) +#define MAX_PBES_TWO_LAYER (MAX_PBES_ON_PAGE * MAX_PBES_ON_PAGE) + +static struct qedr_pbl *qedr_alloc_pbl_tbl(struct qedr_dev *dev, + struct qedr_pbl_info *pbl_info, + gfp_t flags) +{ + struct pci_dev *pdev = dev->pdev; + struct qedr_pbl *pbl_table; + dma_addr_t *pbl_main_tbl; + dma_addr_t pa; + void *va; + int i; + + pbl_table = kcalloc(pbl_info->num_pbls, sizeof(*pbl_table), flags); + if (!pbl_table) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < pbl_info->num_pbls; i++) { + va = dma_alloc_coherent(&pdev->dev, pbl_info->pbl_size, + &pa, flags); + if (!va) + goto err; + + memset(va, 0, pbl_info->pbl_size); + pbl_table[i].va = va; + pbl_table[i].pa = pa; + } + + /* Two-Layer PBLs, if we have more than one pbl we need to initialize + * the first one with physical pointers to all of the rest + */ + pbl_main_tbl = (dma_addr_t *)pbl_table[0].va; + for (i = 0; i < pbl_info->num_pbls - 1; i++) + pbl_main_tbl[i] = pbl_table[i + 1].pa; + + return pbl_table; + +err: + for (i--; i >= 0; i--) + dma_free_coherent(&pdev->dev, pbl_info->pbl_size, + pbl_table[i].va, pbl_table[i].pa); + + qedr_free_pbl(dev, pbl_info, pbl_table); + + return ERR_PTR(-ENOMEM); +} + +static int qedr_prepare_pbl_tbl(struct qedr_dev *dev, + struct qedr_pbl_info *pbl_info, + u32 num_pbes, int two_layer_capable) +{ + u32 pbl_capacity; + u32 pbl_size; + u32 num_pbls; + + if ((num_pbes > MAX_PBES_ON_PAGE) && two_layer_capable) { + if (num_pbes > MAX_PBES_TWO_LAYER) { + DP_ERR(dev, "prepare pbl table: too many pages %d\n", + num_pbes); + return -EINVAL; + } + + /* calculate required pbl page size */ + pbl_size = MIN_FW_PBL_PAGE_SIZE; + pbl_capacity = NUM_PBES_ON_PAGE(pbl_size) * + NUM_PBES_ON_PAGE(pbl_size); + + while (pbl_capacity < num_pbes) { + pbl_size *= 2; + pbl_capacity = pbl_size / sizeof(u64); + pbl_capacity = pbl_capacity * pbl_capacity; + } + + num_pbls = DIV_ROUND_UP(num_pbes, NUM_PBES_ON_PAGE(pbl_size)); + num_pbls++; /* One for the layer0 ( points to the pbls) */ + pbl_info->two_layered = true; + } else { + /* One layered PBL */ + num_pbls = 1; + pbl_size = max_t(u32, MIN_FW_PBL_PAGE_SIZE, + 
roundup_pow_of_two((num_pbes * sizeof(u64)))); + pbl_info->two_layered = false; + } + + pbl_info->num_pbls = num_pbls; + pbl_info->pbl_size = pbl_size; + pbl_info->num_pbes = num_pbes; + + DP_DEBUG(dev, QEDR_MSG_MR, + "prepare pbl table: num_pbes=%d, num_pbls=%d, pbl_size=%d\n", + pbl_info->num_pbes, pbl_info->num_pbls, pbl_info->pbl_size); + + return 0; +} + +static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem, + struct qedr_pbl *pbl, + struct qedr_pbl_info *pbl_info) +{ + int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; + struct qedr_pbl *pbl_tbl; + struct scatterlist *sg; + struct regpair *pbe; + int entry; + u32 addr; + + if (!pbl_info->num_pbes) + return; + + /* If we have a two layered pbl, the first pbl points to the rest + * of the pbls and the first entry lays on the second pbl in the table + */ + if (pbl_info->two_layered) + pbl_tbl = &pbl[1]; + else + pbl_tbl = pbl; + + pbe = (struct regpair *)pbl_tbl->va; + if (!pbe) { + DP_ERR(dev, "cannot populate PBL due to a NULL PBE\n"); + return; + } + + pbe_cnt = 0; + + shift = ilog2(umem->page_size); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->lo = cpu_to_le32(sg_dma_address(sg) + + umem->page_size * pg_cnt); + addr = upper_32_bits(sg_dma_address(sg) + + umem->page_size * pg_cnt); + pbe->hi = cpu_to_le32(addr); + pbe_cnt++; + total_num_pbes++; + pbe++; + + if (total_num_pbes == pbl_info->num_pbes) + return; + + /* If the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct regpair *)pbl_tbl->va; + pbe_cnt = 0; + } + } + } +} + +static int qedr_copy_cq_uresp(struct qedr_dev *dev, + struct qedr_cq *cq, struct ib_udata *udata) +{ + struct qedr_create_cq_uresp uresp; + int rc; + + memset(&uresp, 0, sizeof(uresp)); + + uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); + uresp.icid = cq->icid; + + rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, "copy error cqid=0x%x.\n", cq->icid); + + return rc; +} + +static void consume_cqe(struct qedr_cq *cq) +{ + if (cq->latest_cqe == cq->toggle_cqe) + cq->pbl_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + + cq->latest_cqe = qed_chain_consume(&cq->pbl); +} + +static inline int qedr_align_cq_entries(int entries) +{ + u64 size, aligned_size; + + /* We allocate an extra entry that we don't report to the FW. 
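+ * E.g. assuming 4 KiB pages and a 32-byte union rdma_cqe, a request
+ * for 256 entries becomes (256 + 1) * 32 = 8224 bytes, aligned up to
+ * 12288, so 384 usable CQ entries are returned.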
*/ + size = (entries + 1) * QEDR_CQE_SIZE; + aligned_size = ALIGN(size, PAGE_SIZE); + + return aligned_size / QEDR_CQE_SIZE; +} + +static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx, + struct qedr_dev *dev, + struct qedr_userq *q, + u64 buf_addr, size_t buf_len, + int access, int dmasync) +{ + int page_cnt; + int rc; + + q->buf_addr = buf_addr; + q->buf_len = buf_len; + q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync); + if (IS_ERR(q->umem)) { + DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", + PTR_ERR(q->umem)); + return PTR_ERR(q->umem); + } + + page_cnt = ib_umem_page_count(q->umem); + rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, page_cnt, 0); + if (rc) + goto err0; + + q->pbl_tbl = qedr_alloc_pbl_tbl(dev, &q->pbl_info, GFP_KERNEL); + if (IS_ERR_OR_NULL(q->pbl_tbl)) + goto err0; + + qedr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info); + + return 0; + +err0: + ib_umem_release(q->umem); + + return rc; +} + +static inline void qedr_init_cq_params(struct qedr_cq *cq, + struct qedr_ucontext *ctx, + struct qedr_dev *dev, int vector, + int chain_entries, int page_cnt, + u64 pbl_ptr, + struct qed_rdma_create_cq_in_params + *params) +{ + memset(params, 0, sizeof(*params)); + params->cq_handle_hi = upper_32_bits((uintptr_t)cq); + params->cq_handle_lo = lower_32_bits((uintptr_t)cq); + params->cnq_id = vector; + params->cq_size = chain_entries - 1; + params->dpi = (ctx) ? ctx->dpi : dev->dpi; + params->pbl_num_pages = page_cnt; + params->pbl_ptr = pbl_ptr; + params->pbl_two_level = 0; +} + +static void doorbell_cq(struct qedr_cq *cq, u32 cons, u8 flags) +{ + /* Flush data before signalling doorbell */ + wmb(); + cq->db.data.agg_flags = flags; + cq->db.data.value = cpu_to_le32(cons); + writeq(cq->db.raw, cq->db_addr); + + /* Make sure write would stick */ + mmiowb(); +} + +int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct qedr_cq *cq = get_qedr_cq(ibcq); + unsigned long sflags; + + if (cq->cq_type == QEDR_CQ_TYPE_GSI) + return 0; + + spin_lock_irqsave(&cq->cq_lock, sflags); + + cq->arm_flags = 0; + + if (flags & IB_CQ_SOLICITED) + cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD; + + if (flags & IB_CQ_NEXT_COMP) + cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_CF_CMD; + + doorbell_cq(cq, cq->cq_cons - 1, cq->arm_flags); + + spin_unlock_irqrestore(&cq->cq_lock, sflags); + + return 0; +} + +struct ib_cq *qedr_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, struct ib_udata *udata) +{ + struct qedr_ucontext *ctx = get_qedr_ucontext(ib_ctx); + struct qed_rdma_destroy_cq_out_params destroy_oparams; + struct qed_rdma_destroy_cq_in_params destroy_iparams; + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qed_rdma_create_cq_in_params params; + struct qedr_create_cq_ureq ureq; + int vector = attr->comp_vector; + int entries = attr->cqe; + struct qedr_cq *cq; + int chain_entries; + int page_cnt; + u64 pbl_ptr; + u16 icid; + int rc; + + DP_DEBUG(dev, QEDR_MSG_INIT, + "create_cq: called from %s. entries=%d, vector=%d\n", + udata ? "User Lib" : "Kernel", entries, vector); + + if (entries > QEDR_MAX_CQES) { + DP_ERR(dev, + "create cq: the number of entries %d is too high. 
Must be equal or below %d.\n", + entries, QEDR_MAX_CQES); + return ERR_PTR(-EINVAL); + } + + chain_entries = qedr_align_cq_entries(entries); + chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + if (udata) { + memset(&ureq, 0, sizeof(ureq)); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { + DP_ERR(dev, + "create cq: problem copying data from user space\n"); + goto err0; + } + + if (!ureq.len) { + DP_ERR(dev, + "create cq: cannot create a cq with 0 entries\n"); + goto err0; + } + + cq->cq_type = QEDR_CQ_TYPE_USER; + + rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr, + ureq.len, IB_ACCESS_LOCAL_WRITE, 1); + if (rc) + goto err0; + + pbl_ptr = cq->q.pbl_tbl->pa; + page_cnt = cq->q.pbl_info.num_pbes; + } else { + cq->cq_type = QEDR_CQ_TYPE_KERNEL; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + chain_entries, + sizeof(union rdma_cqe), + &cq->pbl); + if (rc) + goto err1; + + page_cnt = qed_chain_get_page_cnt(&cq->pbl); + pbl_ptr = qed_chain_get_pbl_phys(&cq->pbl); + } + + qedr_init_cq_params(cq, ctx, dev, vector, chain_entries, page_cnt, + pbl_ptr, ¶ms); + + rc = dev->ops->rdma_create_cq(dev->rdma_ctx, ¶ms, &icid); + if (rc) + goto err2; + + cq->icid = icid; + cq->sig = QEDR_CQ_MAGIC_NUMBER; + spin_lock_init(&cq->cq_lock); + + if (ib_ctx) { + rc = qedr_copy_cq_uresp(dev, cq, udata); + if (rc) + goto err3; + } else { + /* Generate doorbell address. */ + cq->db_addr = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); + cq->db.data.icid = cq->icid; + cq->db.data.params = DB_AGG_CMD_SET << + RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; + + /* point to the very last element, passing it we will toggle */ + cq->toggle_cqe = qed_chain_get_last_elem(&cq->pbl); + cq->pbl_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + cq->latest_cqe = NULL; + consume_cqe(cq); + cq->cq_cons = qed_chain_get_cons_idx_u32(&cq->pbl); + } + + DP_DEBUG(dev, QEDR_MSG_CQ, + "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n", + cq->icid, cq, params.cq_size); + + return &cq->ibcq; + +err3: + destroy_iparams.icid = cq->icid; + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &destroy_iparams, + &destroy_oparams); +err2: + if (udata) + qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); + else + dev->ops->common->chain_free(dev->cdev, &cq->pbl); +err1: + if (udata) + ib_umem_release(cq->q.umem); +err0: + kfree(cq); + return ERR_PTR(-EINVAL); +} + +int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qedr_cq *cq = get_qedr_cq(ibcq); + + DP_ERR(dev, "cq %p RESIZE NOT SUPPORTED\n", cq); + + return 0; +} + +int qedr_destroy_cq(struct ib_cq *ibcq) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qed_rdma_destroy_cq_out_params oparams; + struct qed_rdma_destroy_cq_in_params iparams; + struct qedr_cq *cq = get_qedr_cq(ibcq); + + DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq: cq_id %d", cq->icid); + + /* GSIs CQs are handled by driver, so they don't exist in the FW */ + if (cq->cq_type != QEDR_CQ_TYPE_GSI) { + iparams.icid = cq->icid; + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); + dev->ops->common->chain_free(dev->cdev, &cq->pbl); + } + + if (ibcq->uobject && ibcq->uobject->context) { + qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); + ib_umem_release(cq->q.umem); + } + + kfree(cq); + + return 0; +} diff --git 
a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 9472044dd587..36c8a692f740 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -40,6 +40,8 @@ int qedr_modify_port(struct ib_device *, u8 port, int mask, int qedr_query_gid(struct ib_device *, u8 port, int index, union ib_gid *gid); +int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); + struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); int qedr_dealloc_ucontext(struct ib_ucontext *); @@ -49,4 +51,16 @@ int qedr_del_gid(struct ib_device *device, u8 port_num, int qedr_add_gid(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); +struct ib_pd *qedr_alloc_pd(struct ib_device *, + struct ib_ucontext *, struct ib_udata *); +int qedr_dealloc_pd(struct ib_pd *pd); + +struct ib_cq *qedr_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata); +int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); +int qedr_destroy_cq(struct ib_cq *); +int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + #endif diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index f7c7fff7a877..b0fc5f2125e0 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -50,4 +50,23 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_srq_wr; __u32 max_cqes; }; + +struct qedr_alloc_pd_ureq { + __u64 rsvd1; +}; + +struct qedr_alloc_pd_uresp { + __u32 pd_id; +}; + +struct qedr_create_cq_ureq { + __u64 addr; + __u64 len; +}; + +struct qedr_create_cq_uresp { + __u32 db_offset; + __u16 icid; +}; + #endif /* __QEDR_USER_H__ */ -- cgit v1.2.3 From cecbcddf6461a11ce229e80bb3981415220c9763 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Mon, 10 Oct 2016 13:15:34 +0300 Subject: qedr: Add support for QP verbs Add support for Queue Pair verbs which adds, deletes, modifies and queries Queue Pairs. 
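For illustration only (not part of this patch), a consumer drives the usual
RC state ladder through ib_modify_qp(), which lands in qedr_modify_qp();
a minimal sketch with abridged attributes:

	struct ib_qp_attr attr = {
		.qp_state	 = IB_QPS_INIT,
		.pkey_index	 = 0,
		.port_num	 = 1,
		.qp_access_flags = IB_ACCESS_REMOTE_WRITE,
	};
	int rc = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
				IB_QP_PORT | IB_QP_ACCESS_FLAGS);
	/* ...followed by IB_QPS_RTR and IB_QPS_RTS with the usual masks */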
Signed-off-by: Rajesh Borundia Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/main.c | 15 +- drivers/infiniband/hw/qedr/qedr.h | 125 ++++ drivers/infiniband/hw/qedr/qedr_cm.h | 40 + drivers/infiniband/hw/qedr/qedr_hsi_rdma.h | 11 + drivers/infiniband/hw/qedr/verbs.c | 1089 ++++++++++++++++++++++++++++ drivers/infiniband/hw/qedr/verbs.h | 7 + include/uapi/rdma/qedr-abi.h | 34 + 7 files changed, 1320 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/qedr/qedr_cm.h (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 35928abb6b63..13ba47b7b99f 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -48,6 +48,8 @@ MODULE_AUTHOR("QLogic Corporation"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(QEDR_MODULE_VERSION); +#define QEDR_WQ_MULTIPLIER_DFT (3) + void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num, enum ib_event_type type) { @@ -94,7 +96,11 @@ static int qedr_register_device(struct qedr_dev *dev) QEDR_UVERBS(CREATE_CQ) | QEDR_UVERBS(RESIZE_CQ) | QEDR_UVERBS(DESTROY_CQ) | - QEDR_UVERBS(REQ_NOTIFY_CQ); + QEDR_UVERBS(REQ_NOTIFY_CQ) | + QEDR_UVERBS(CREATE_QP) | + QEDR_UVERBS(MODIFY_QP) | + QEDR_UVERBS(QUERY_QP) | + QEDR_UVERBS(DESTROY_QP); dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->num_cnq; @@ -120,6 +126,11 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.resize_cq = qedr_resize_cq; dev->ibdev.req_notify_cq = qedr_arm_cq; + dev->ibdev.create_qp = qedr_create_qp; + dev->ibdev.modify_qp = qedr_modify_qp; + dev->ibdev.query_qp = qedr_query_qp; + dev->ibdev.destroy_qp = qedr_destroy_qp; + dev->ibdev.query_pkey = qedr_query_pkey; dev->ibdev.dma_device = &dev->pdev->dev; @@ -630,6 +641,8 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, goto init_err; } + dev->wq_multiplier = QEDR_WQ_MULTIPLIER_DFT; + qedr_pci_set_atomic(dev, pdev); rc = qedr_alloc_resources(dev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 9e2846a599ce..e9fe941c48ad 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -52,6 +52,9 @@ #define QEDR_MSG_MISC "MISC" #define QEDR_MSG_CQ " CQ" #define QEDR_MSG_MR " MR" +#define QEDR_MSG_RQ " RQ" +#define QEDR_MSG_SQ " SQ" +#define QEDR_MSG_QP " QP" #define QEDR_CQ_MAGIC_NUMBER (0x11223344) @@ -143,6 +146,8 @@ struct qedr_dev { u32 dp_module; u8 dp_level; u8 num_hwfns; + uint wq_multiplier; + }; #define QEDR_MAX_SQ_PBL (0x8000) @@ -272,6 +277,122 @@ struct qedr_mm { struct list_head entry; }; +union db_prod32 { + struct rdma_pwm_val16_data data; + u32 raw; +}; + +struct qedr_qp_hwq_info { + /* WQE Elements */ + struct qed_chain pbl; + u64 p_phys_addr_tbl; + u32 max_sges; + + /* WQE */ + u16 prod; + u16 cons; + u16 wqe_cons; + u16 max_wr; + + /* DB */ + void __iomem *db; + union db_prod32 db_data; +}; + +#define QEDR_INC_SW_IDX(p_info, index) \ + do { \ + p_info->index = (p_info->index + 1) & \ + qed_chain_get_capacity(p_info->pbl) \ + } while (0) + +enum qedr_qp_err_bitmap { + QEDR_QP_ERR_SQ_FULL = 1, + QEDR_QP_ERR_RQ_FULL = 2, + QEDR_QP_ERR_BAD_SR = 4, + QEDR_QP_ERR_BAD_RR = 8, + QEDR_QP_ERR_SQ_PBL_FULL = 16, + QEDR_QP_ERR_RQ_PBL_FULL = 32, +}; + +struct qedr_qp { + struct ib_qp ibqp; /* must be first */ + struct qedr_dev *dev; + + struct qedr_qp_hwq_info sq; + struct qedr_qp_hwq_info rq; + + u32 max_inline_data; + + /* Lock for QP's */ + spinlock_t q_lock; + struct qedr_cq 
*sq_cq; + struct qedr_cq *rq_cq; + struct qedr_srq *srq; + enum qed_roce_qp_state state; + u32 id; + struct qedr_pd *pd; + enum ib_qp_type qp_type; + struct qed_rdma_qp *qed_qp; + u32 qp_id; + u16 icid; + u16 mtu; + int sgid_idx; + u32 rq_psn; + u32 sq_psn; + u32 qkey; + u32 dest_qp_num; + + /* Relevant to qps created from kernel space only (ULPs) */ + u8 prev_wqe_size; + u16 wqe_cons; + u32 err_bitmap; + bool signaled; + + /* SQ shadow */ + struct { + u64 wr_id; + enum ib_wc_opcode opcode; + u32 bytes_len; + u8 wqe_size; + bool signaled; + dma_addr_t icrc_mapping; + u32 *icrc; + struct qedr_mr *mr; + } *wqe_wr_id; + + /* RQ shadow */ + struct { + u64 wr_id; + struct ib_sge sg_list[RDMA_MAX_SGE_PER_RQ_WQE]; + u8 wqe_size; + + u16 vlan_id; + int rc; + } *rqe_wr_id; + + /* Relevant to qps created from user space only (applications) */ + struct qedr_userq usq; + struct qedr_userq urq; +}; + +static inline int qedr_get_dmac(struct qedr_dev *dev, + struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ + union ib_gid zero_sgid = { { 0 } }; + struct in6_addr in6; + + if (!memcmp(&ah_attr->grh.dgid, &zero_sgid, sizeof(union ib_gid))) { + DP_ERR(dev, "Local port GID not supported\n"); + eth_zero_addr(mac_addr); + return -EINVAL; + } + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + ether_addr_copy(mac_addr, ah_attr->dmac); + + return 0; +} + static inline struct qedr_ucontext *get_qedr_ucontext(struct ib_ucontext *ibucontext) { @@ -293,4 +414,8 @@ static inline struct qedr_cq *get_qedr_cq(struct ib_cq *ibcq) return container_of(ibcq, struct qedr_cq, ibcq); } +static inline struct qedr_qp *get_qedr_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct qedr_qp, ibqp); +} #endif diff --git a/drivers/infiniband/hw/qedr/qedr_cm.h b/drivers/infiniband/hw/qedr/qedr_cm.h new file mode 100644 index 000000000000..b8a8b76d77b8 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_cm.h @@ -0,0 +1,40 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef LINUX_QEDR_CM_H_ +#define LINUX_QEDR_CM_H_ + +static inline u32 qedr_get_ipv4_from_gid(u8 *gid) +{ + return *(u32 *)(void *)&gid[12]; +} + +#endif diff --git a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h index 84f6520107cc..47705598eec6 100644 --- a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h +++ b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h @@ -158,6 +158,17 @@ struct rdma_srq_sge { __le32 l_key; }; +/* Rdma doorbell data for SQ and RQ */ +struct rdma_pwm_val16_data { + __le16 icid; + __le16 value; +}; + +union rdma_pwm_val16_data_union { + struct rdma_pwm_val16_data as_struct; + __le32 as_dword; +}; + /* Rdma doorbell data for CQ */ struct rdma_pwm_val32_data { __le16 icid; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b525c6cf1df0..a0d1c5fffb63 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -48,6 +48,7 @@ #include "qedr.h" #include "verbs.h" #include +#include "qedr_cm.h" #define DB_ADDR_SHIFT(addr) ((addr) << DB_PWM_ADDR_OFFSET_SHIFT) @@ -993,3 +994,1091 @@ int qedr_destroy_cq(struct ib_cq *ibcq) return 0; } + +static inline int get_gid_info_from_table(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, + struct qed_rdma_modify_qp_in_params + *qp_params) +{ + enum rdma_network_type nw_type; + struct ib_gid_attr gid_attr; + union ib_gid gid; + u32 ipv4_addr; + int rc = 0; + int i; + + rc = ib_get_cached_gid(ibqp->device, attr->ah_attr.port_num, + attr->ah_attr.grh.sgid_index, &gid, &gid_attr); + if (rc) + return rc; + + if (!memcmp(&gid, &zgid, sizeof(gid))) + return -ENOENT; + + if (gid_attr.ndev) { + qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); + + dev_put(gid_attr.ndev); + nw_type = ib_gid_to_network_type(gid_attr.gid_type, &gid); + switch (nw_type) { + case RDMA_NETWORK_IPV6: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &attr->ah_attr.grh.dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V2_IPV6; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + break; + case RDMA_NETWORK_IB: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &attr->ah_attr.grh.dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V1; + break; + case RDMA_NETWORK_IPV4: + memset(&qp_params->sgid, 0, sizeof(qp_params->sgid)); + memset(&qp_params->dgid, 0, sizeof(qp_params->dgid)); + ipv4_addr = qedr_get_ipv4_from_gid(gid.raw); + qp_params->sgid.ipv4_addr = ipv4_addr; + ipv4_addr = + qedr_get_ipv4_from_gid(attr->ah_attr.grh.dgid.raw); + qp_params->dgid.ipv4_addr = ipv4_addr; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + qp_params->roce_mode = ROCE_V2_IPV4; + break; + } + } + + for (i = 0; i < 4; i++) { + qp_params->sgid.dwords[i] = ntohl(qp_params->sgid.dwords[i]); + qp_params->dgid.dwords[i] = ntohl(qp_params->dgid.dwords[i]); + } + + if (qp_params->vlan_id >= VLAN_CFI_MASK) + qp_params->vlan_id = 0; + + return 0; +} + +static void qedr_cleanup_user_sq(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qedr_free_pbl(dev, &qp->usq.pbl_info, qp->usq.pbl_tbl); + ib_umem_release(qp->usq.umem); +} + +static void qedr_cleanup_user_rq(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qedr_free_pbl(dev, &qp->urq.pbl_info, qp->urq.pbl_tbl); + ib_umem_release(qp->urq.umem); +} + +static void qedr_cleanup_kernel_sq(struct qedr_dev *dev, struct 
qedr_qp *qp) +{ + dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl); + kfree(qp->wqe_wr_id); +} + +static void qedr_cleanup_kernel_rq(struct qedr_dev *dev, struct qedr_qp *qp) +{ + dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl); + kfree(qp->rqe_wr_id); +} + +static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev, + struct ib_qp_init_attr *attrs) +{ + struct qedr_device_attr *qattr = &dev->attr; + + /* QP0... attrs->qp_type == IB_QPT_GSI */ + if (attrs->qp_type != IB_QPT_RC && attrs->qp_type != IB_QPT_GSI) { + DP_DEBUG(dev, QEDR_MSG_QP, + "create qp: unsupported qp type=0x%x requested\n", + attrs->qp_type); + return -EINVAL; + } + + if (attrs->cap.max_send_wr > qattr->max_sqe) { + DP_ERR(dev, + "create qp: cannot create a SQ with %d elements (max_send_wr=0x%x)\n", + attrs->cap.max_send_wr, qattr->max_sqe); + return -EINVAL; + } + + if (attrs->cap.max_inline_data > qattr->max_inline) { + DP_ERR(dev, + "create qp: unsupported inline data size=0x%x requested (max_inline=0x%x)\n", + attrs->cap.max_inline_data, qattr->max_inline); + return -EINVAL; + } + + if (attrs->cap.max_send_sge > qattr->max_sge) { + DP_ERR(dev, + "create qp: unsupported send_sge=0x%x requested (max_send_sge=0x%x)\n", + attrs->cap.max_send_sge, qattr->max_sge); + return -EINVAL; + } + + if (attrs->cap.max_recv_sge > qattr->max_sge) { + DP_ERR(dev, + "create qp: unsupported recv_sge=0x%x requested (max_recv_sge=0x%x)\n", + attrs->cap.max_recv_sge, qattr->max_sge); + return -EINVAL; + } + + /* Unprivileged user space cannot create special QP */ + if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { + DP_ERR(dev, + "create qp: userspace can't create special QPs of type=0x%x\n", + attrs->qp_type); + return -EINVAL; + } + + return 0; +} + +static void qedr_copy_rq_uresp(struct qedr_create_qp_uresp *uresp, + struct qedr_qp *qp) +{ + uresp->rq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); + uresp->rq_icid = qp->icid; +} + +static void qedr_copy_sq_uresp(struct qedr_create_qp_uresp *uresp, + struct qedr_qp *qp) +{ + uresp->sq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); + uresp->sq_icid = qp->icid + 1; +} + +static int qedr_copy_qp_uresp(struct qedr_dev *dev, + struct qedr_qp *qp, struct ib_udata *udata) +{ + struct qedr_create_qp_uresp uresp; + int rc; + + memset(&uresp, 0, sizeof(uresp)); + qedr_copy_sq_uresp(&uresp, qp); + qedr_copy_rq_uresp(&uresp, qp); + + uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE; + uresp.qp_id = qp->qp_id; + + rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, + "create qp: failed a copy to user space with qp icid=0x%x.\n", + qp->icid); + + return rc; +} + +static void qedr_set_qp_init_params(struct qedr_dev *dev, + struct qedr_qp *qp, + struct qedr_pd *pd, + struct ib_qp_init_attr *attrs) +{ + qp->pd = pd; + + spin_lock_init(&qp->q_lock); + + qp->qp_type = attrs->qp_type; + qp->max_inline_data = attrs->cap.max_inline_data; + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->state = QED_ROCE_QP_STATE_RESET; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; + qp->sq_cq = get_qedr_cq(attrs->send_cq); + qp->rq_cq = get_qedr_cq(attrs->recv_cq); + qp->dev = dev; + + DP_DEBUG(dev, QEDR_MSG_QP, + "QP params:\tpd = %d, qp_type = %d, max_inline_data = %d, state = %d, signaled = %d, use_srq=%d\n", + pd->pd_id, qp->qp_type, qp->max_inline_data, + qp->state, qp->signaled, (attrs->srq) ? 
1 : 0); + DP_DEBUG(dev, QEDR_MSG_QP, + "SQ params:\tsq_max_sges = %d, sq_cq_id = %d\n", + qp->sq.max_sges, qp->sq_cq->icid); + qp->rq.max_sges = attrs->cap.max_recv_sge; + DP_DEBUG(dev, QEDR_MSG_QP, + "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n", + qp->rq.max_sges, qp->rq_cq->icid); +} + +static inline void +qedr_init_qp_user_params(struct qed_rdma_create_qp_in_params *params, + struct qedr_create_qp_ureq *ureq) +{ + /* QP handle to be written in CQE */ + params->qp_handle_lo = ureq->qp_handle_lo; + params->qp_handle_hi = ureq->qp_handle_hi; +} + +static inline void +qedr_init_qp_kernel_doorbell_sq(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qp->sq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); + qp->sq.db_data.data.icid = qp->icid + 1; +} + +static inline void +qedr_init_qp_kernel_doorbell_rq(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qp->rq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); + qp->rq.db_data.data.icid = qp->icid; +} + +static inline int +qedr_init_qp_kernel_params_rq(struct qedr_dev *dev, + struct qedr_qp *qp, struct ib_qp_init_attr *attrs) +{ + /* Allocate driver internal RQ array */ + qp->rqe_wr_id = kcalloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id), + GFP_KERNEL); + if (!qp->rqe_wr_id) + return -ENOMEM; + + DP_DEBUG(dev, QEDR_MSG_QP, "RQ max_wr set to %d.\n", qp->rq.max_wr); + + return 0; +} + +static inline int +qedr_init_qp_kernel_params_sq(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs, + struct qed_rdma_create_qp_in_params *params) +{ + u32 temp_max_wr; + + /* Allocate driver internal SQ array */ + temp_max_wr = attrs->cap.max_send_wr * dev->wq_multiplier; + temp_max_wr = min_t(u32, temp_max_wr, dev->attr.max_sqe); + + /* temp_max_wr < attr->max_sqe < u16 so the casting is safe */ + qp->sq.max_wr = (u16)temp_max_wr; + qp->wqe_wr_id = kcalloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id), + GFP_KERNEL); + if (!qp->wqe_wr_id) + return -ENOMEM; + + DP_DEBUG(dev, QEDR_MSG_QP, "SQ max_wr set to %d.\n", qp->sq.max_wr); + + /* QP handle to be written in CQE */ + params->qp_handle_lo = lower_32_bits((uintptr_t)qp); + params->qp_handle_hi = upper_32_bits((uintptr_t)qp); + + return 0; +} + +static inline int qedr_init_qp_kernel_sq(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs) +{ + u32 n_sq_elems, n_sq_entries; + int rc; + + /* A single work request may take up to QEDR_MAX_SQ_WQE_SIZE elements in + * the ring. The ring should allow at least a single WR, even if the + * user requested none, due to allocation issues. + */ + n_sq_entries = attrs->cap.max_send_wr; + n_sq_entries = min_t(u32, n_sq_entries, dev->attr.max_sqe); + n_sq_entries = max_t(u32, n_sq_entries, 1); + n_sq_elems = n_sq_entries * QEDR_MAX_SQE_ELEMENTS_PER_SQE; + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_sq_elems, + QEDR_SQE_ELEMENT_SIZE, + &qp->sq.pbl); + if (rc) { + DP_ERR(dev, "failed to allocate QP %p SQ\n", qp); + return rc; + } + + DP_DEBUG(dev, QEDR_MSG_SQ, + "SQ Pbl base addr = %llx max_send_wr=%d max_wr=%d capacity=%d, rc=%d\n", + qed_chain_get_pbl_phys(&qp->sq.pbl), attrs->cap.max_send_wr, + n_sq_entries, qed_chain_get_capacity(&qp->sq.pbl), rc); + return 0; +} + +static inline int qedr_init_qp_kernel_rq(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs) +{ + u32 n_rq_elems, n_rq_entries; + int rc; + + /* A single work request may take up to QEDR_MAX_RQ_WQE_SIZE elements in + * the ring. 
The ring should allow at least a single WR, even if the + * user requested none, due to allocation issues. + */ + n_rq_entries = max_t(u32, attrs->cap.max_recv_wr, 1); + n_rq_elems = n_rq_entries * QEDR_MAX_RQE_ELEMENTS_PER_RQE; + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_rq_elems, + QEDR_RQE_ELEMENT_SIZE, + &qp->rq.pbl); + + if (rc) { + DP_ERR(dev, "failed to allocate memory for QP %p RQ\n", qp); + return -ENOMEM; + } + + DP_DEBUG(dev, QEDR_MSG_RQ, + "RQ Pbl base addr = %llx max_recv_wr=%d max_wr=%d capacity=%d, rc=%d\n", + qed_chain_get_pbl_phys(&qp->rq.pbl), attrs->cap.max_recv_wr, + n_rq_entries, qed_chain_get_capacity(&qp->rq.pbl), rc); + + /* n_rq_entries < u16 so the casting is safe */ + qp->rq.max_wr = (u16)n_rq_entries; + + return 0; +} + +static inline void +qedr_init_qp_in_params_sq(struct qedr_dev *dev, + struct qedr_pd *pd, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata, + struct qed_rdma_create_qp_in_params *params) +{ + /* QP handle to be written in an async event */ + params->qp_handle_async_lo = lower_32_bits((uintptr_t)qp); + params->qp_handle_async_hi = upper_32_bits((uintptr_t)qp); + + params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR); + params->fmr_and_reserved_lkey = !udata; + params->pd = pd->pd_id; + params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi; + params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid; + params->max_sq_sges = 0; + params->stats_queue = 0; + + if (udata) { + params->sq_num_pages = qp->usq.pbl_info.num_pbes; + params->sq_pbl_ptr = qp->usq.pbl_tbl->pa; + } else { + params->sq_num_pages = qed_chain_get_page_cnt(&qp->sq.pbl); + params->sq_pbl_ptr = qed_chain_get_pbl_phys(&qp->sq.pbl); + } +} + +static inline void +qedr_init_qp_in_params_rq(struct qedr_qp *qp, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata, + struct qed_rdma_create_qp_in_params *params) +{ + params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid; + params->srq_id = 0; + params->use_srq = false; + + if (udata) { + params->rq_num_pages = qp->urq.pbl_info.num_pbes; + params->rq_pbl_ptr = qp->urq.pbl_tbl->pa; + } else { + params->rq_num_pages = qed_chain_get_page_cnt(&qp->rq.pbl); + params->rq_pbl_ptr = qed_chain_get_pbl_phys(&qp->rq.pbl); + } +} + +static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp) +{ + DP_DEBUG(dev, QEDR_MSG_QP, + "create qp: successfully created user QP. 
qp=%p, sq_addr=0x%llx, sq_len=%zd, rq_addr=0x%llx, rq_len=%zd\n", + qp, qp->usq.buf_addr, qp->usq.buf_len, qp->urq.buf_addr, + qp->urq.buf_len); +} + +static inline int qedr_init_user_qp(struct ib_ucontext *ib_ctx, + struct qedr_dev *dev, + struct qedr_qp *qp, + struct qedr_create_qp_ureq *ureq) +{ + int rc; + + /* SQ - read access only (0), dma sync not required (0) */ + rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq->sq_addr, + ureq->sq_len, 0, 0); + if (rc) + return rc; + + /* RQ - read access only (0), dma sync not required (0) */ + rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq->rq_addr, + ureq->rq_len, 0, 0); + + if (rc) + qedr_cleanup_user_sq(dev, qp); + return rc; +} + +static inline int +qedr_init_kernel_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs, + struct qed_rdma_create_qp_in_params *params) +{ + int rc; + + rc = qedr_init_qp_kernel_sq(dev, qp, attrs); + if (rc) { + DP_ERR(dev, "failed to init kernel QP %p SQ\n", qp); + return rc; + } + + rc = qedr_init_qp_kernel_params_sq(dev, qp, attrs, params); + if (rc) { + dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl); + DP_ERR(dev, "failed to init kernel QP %p SQ params\n", qp); + return rc; + } + + rc = qedr_init_qp_kernel_rq(dev, qp, attrs); + if (rc) { + qedr_cleanup_kernel_sq(dev, qp); + DP_ERR(dev, "failed to init kernel QP %p RQ\n", qp); + return rc; + } + + rc = qedr_init_qp_kernel_params_rq(dev, qp, attrs); + if (rc) { + DP_ERR(dev, "failed to init kernel QP %p RQ params\n", qp); + qedr_cleanup_kernel_sq(dev, qp); + dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl); + return rc; + } + + return rc; +} + +struct ib_qp *qedr_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qed_rdma_create_qp_out_params out_params; + struct qed_rdma_create_qp_in_params in_params; + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct ib_ucontext *ib_ctx = NULL; + struct qedr_ucontext *ctx = NULL; + struct qedr_create_qp_ureq ureq; + struct qedr_qp *qp; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n", + udata ? 
"user library" : "kernel", pd); + + rc = qedr_check_qp_attrs(ibpd, dev, attrs); + if (rc) + return ERR_PTR(rc); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + if (attrs->srq) + return ERR_PTR(-EINVAL); + + DP_DEBUG(dev, QEDR_MSG_QP, + "create qp: sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n", + get_qedr_cq(attrs->send_cq), + get_qedr_cq(attrs->send_cq)->icid, + get_qedr_cq(attrs->recv_cq), + get_qedr_cq(attrs->recv_cq)->icid); + + qedr_set_qp_init_params(dev, qp, pd, attrs); + + memset(&in_params, 0, sizeof(in_params)); + + if (udata) { + if (!(udata && ibpd->uobject && ibpd->uobject->context)) + goto err0; + + ib_ctx = ibpd->uobject->context; + ctx = get_qedr_ucontext(ib_ctx); + + memset(&ureq, 0, sizeof(ureq)); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { + DP_ERR(dev, + "create qp: problem copying data from user space\n"); + goto err0; + } + + rc = qedr_init_user_qp(ib_ctx, dev, qp, &ureq); + if (rc) + goto err0; + + qedr_init_qp_user_params(&in_params, &ureq); + } else { + rc = qedr_init_kernel_qp(dev, qp, attrs, &in_params); + if (rc) + goto err0; + } + + qedr_init_qp_in_params_sq(dev, pd, qp, attrs, udata, &in_params); + qedr_init_qp_in_params_rq(qp, attrs, udata, &in_params); + + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, + &in_params, &out_params); + + if (!qp->qed_qp) + goto err1; + + qp->qp_id = out_params.qp_id; + qp->icid = out_params.icid; + qp->ibqp.qp_num = qp->qp_id; + + if (udata) { + rc = qedr_copy_qp_uresp(dev, qp, udata); + if (rc) + goto err2; + + qedr_qp_user_print(dev, qp); + } else { + qedr_init_qp_kernel_doorbell_sq(dev, qp); + qedr_init_qp_kernel_doorbell_rq(dev, qp); + } + + DP_DEBUG(dev, QEDR_MSG_QP, "created %s space QP %p\n", + udata ? "user" : "kernel", qp); + + return &qp->ibqp; + +err2: + rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp); + if (rc) + DP_ERR(dev, "create qp: fatal fault. 
rc=%d", rc); +err1: + if (udata) { + qedr_cleanup_user_sq(dev, qp); + qedr_cleanup_user_rq(dev, qp); + } else { + qedr_cleanup_kernel_sq(dev, qp); + qedr_cleanup_kernel_rq(dev, qp); + } + +err0: + kfree(qp); + + return ERR_PTR(-EFAULT); +} + +enum ib_qp_state qedr_get_ibqp_state(enum qed_roce_qp_state qp_state) +{ + switch (qp_state) { + case QED_ROCE_QP_STATE_RESET: + return IB_QPS_RESET; + case QED_ROCE_QP_STATE_INIT: + return IB_QPS_INIT; + case QED_ROCE_QP_STATE_RTR: + return IB_QPS_RTR; + case QED_ROCE_QP_STATE_RTS: + return IB_QPS_RTS; + case QED_ROCE_QP_STATE_SQD: + return IB_QPS_SQD; + case QED_ROCE_QP_STATE_ERR: + return IB_QPS_ERR; + case QED_ROCE_QP_STATE_SQE: + return IB_QPS_SQE; + } + return IB_QPS_ERR; +} + +enum qed_roce_qp_state qedr_get_state_from_ibqp(enum ib_qp_state qp_state) +{ + switch (qp_state) { + case IB_QPS_RESET: + return QED_ROCE_QP_STATE_RESET; + case IB_QPS_INIT: + return QED_ROCE_QP_STATE_INIT; + case IB_QPS_RTR: + return QED_ROCE_QP_STATE_RTR; + case IB_QPS_RTS: + return QED_ROCE_QP_STATE_RTS; + case IB_QPS_SQD: + return QED_ROCE_QP_STATE_SQD; + case IB_QPS_ERR: + return QED_ROCE_QP_STATE_ERR; + default: + return QED_ROCE_QP_STATE_ERR; + } +} + +static void qedr_reset_qp_hwq_info(struct qedr_qp_hwq_info *qph) +{ + qed_chain_reset(&qph->pbl); + qph->prod = 0; + qph->cons = 0; + qph->wqe_cons = 0; + qph->db_data.data.value = cpu_to_le16(0); +} + +static int qedr_update_qp_state(struct qedr_dev *dev, + struct qedr_qp *qp, + enum qed_roce_qp_state new_state) +{ + int status = 0; + + if (new_state == qp->state) + return 1; + + switch (qp->state) { + case QED_ROCE_QP_STATE_RESET: + switch (new_state) { + case QED_ROCE_QP_STATE_INIT: + qp->prev_wqe_size = 0; + qedr_reset_qp_hwq_info(&qp->sq); + qedr_reset_qp_hwq_info(&qp->rq); + break; + default: + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_INIT: + switch (new_state) { + case QED_ROCE_QP_STATE_RTR: + /* Update doorbell (in case post_recv was + * done before move to RTR) + */ + wmb(); + writel(qp->rq.db_data.raw, qp->rq.db); + /* Make sure write takes effect */ + mmiowb(); + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_RTR: + /* RTR->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RTS: + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_RTS: + /* RTS->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_SQD: + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_SQD: + /* SQD->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RTS: + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. 
*/ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_ERR: + /* ERR->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RESET: + break; + default: + status = -EINVAL; + break; + }; + break; + default: + status = -EINVAL; + break; + }; + + return status; +} + +int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qed_rdma_modify_qp_in_params qp_params = { 0 }; + struct qedr_dev *dev = get_qedr_dev(&qp->dev->ibdev); + enum ib_qp_state old_qp_state, new_qp_state; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, + "modify qp: qp %p attr_mask=0x%x, state=%d", qp, attr_mask, + attr->qp_state); + + old_qp_state = qedr_get_ibqp_state(qp->state); + if (attr_mask & IB_QP_STATE) + new_qp_state = attr->qp_state; + else + new_qp_state = old_qp_state; + + if (!ib_modify_qp_is_ok + (old_qp_state, new_qp_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + DP_ERR(dev, + "modify qp: invalid attribute mask=0x%x specified for\n" + "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", + attr_mask, qp->qp_id, ibqp->qp_type, old_qp_state, + new_qp_state); + rc = -EINVAL; + goto err; + } + + /* Translate the masks... */ + if (attr_mask & IB_QP_STATE) { + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_NEW_STATE, 1); + qp_params.new_state = qedr_get_state_from_ibqp(attr->qp_state); + } + + if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp_params.sqd_async = true; + + if (attr_mask & IB_QP_PKEY_INDEX) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_PKEY, 1); + if (attr->pkey_index >= QEDR_ROCE_PKEY_TABLE_LEN) { + rc = -EINVAL; + goto err; + } + + qp_params.pkey = QEDR_ROCE_PKEY_DEFAULT; + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN, 1); + qp_params.incoming_rdma_read_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_READ; + qp_params.incoming_rdma_write_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE; + qp_params.incoming_atomic_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC; + } + + if (attr_mask & (IB_QP_AV | IB_QP_PATH_MTU)) { + if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + pr_err("error: Only MTU sizes of 256, 512, 1024, 2048 and 4096 are supported by RoCE\n"); + rc = -EINVAL; + goto err; + } + qp->mtu = min(ib_mtu_enum_to_int(attr->path_mtu), + ib_mtu_enum_to_int(iboe_get_mtu + (dev->ndev->mtu))); + } + + if (!qp->mtu) { + qp->mtu = + ib_mtu_enum_to_int(iboe_get_mtu(dev->ndev->mtu)); + pr_err("Fixing zeroed MTU to qp->mtu = %d\n", qp->mtu); + } + + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR, 1); + + qp_params.traffic_class_tos = attr->ah_attr.grh.traffic_class; + qp_params.flow_label = attr->ah_attr.grh.flow_label; + qp_params.hop_limit_ttl = attr->ah_attr.grh.hop_limit; + + qp->sgid_idx = attr->ah_attr.grh.sgid_index; + + rc = get_gid_info_from_table(ibqp, attr, attr_mask, &qp_params); + if (rc) { + DP_ERR(dev, + "modify qp: problems with GID index %d (rc=%d)\n", + attr->ah_attr.grh.sgid_index, rc); + return rc; + } + + rc = qedr_get_dmac(dev, &attr->ah_attr, + qp_params.remote_mac_addr); + if (rc) + return rc; + + qp_params.use_local_mac = true; + ether_addr_copy(qp_params.local_mac_addr, dev->ndev->dev_addr); + + DP_DEBUG(dev, QEDR_MSG_QP, "dgid=%x:%x:%x:%x\n", + qp_params.dgid.dwords[0], 
qp_params.dgid.dwords[1], + qp_params.dgid.dwords[2], qp_params.dgid.dwords[3]); + DP_DEBUG(dev, QEDR_MSG_QP, "sgid=%x:%x:%x:%x\n", + qp_params.sgid.dwords[0], qp_params.sgid.dwords[1], + qp_params.sgid.dwords[2], qp_params.sgid.dwords[3]); + DP_DEBUG(dev, QEDR_MSG_QP, "remote_mac=[%pM]\n", + qp_params.remote_mac_addr); +; + + qp_params.mtu = qp->mtu; + qp_params.lb_indication = false; + } + + if (!qp_params.mtu) { + /* Stay with current MTU */ + if (qp->mtu) + qp_params.mtu = qp->mtu; + else + qp_params.mtu = + ib_mtu_enum_to_int(iboe_get_mtu(dev->ndev->mtu)); + } + + if (attr_mask & IB_QP_TIMEOUT) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT, 1); + + qp_params.ack_timeout = attr->timeout; + if (attr->timeout) { + u32 temp; + + temp = 4096 * (1UL << attr->timeout) / 1000 / 1000; + /* FW requires [msec] */ + qp_params.ack_timeout = temp; + } else { + /* Infinite */ + qp_params.ack_timeout = 0; + } + } + if (attr_mask & IB_QP_RETRY_CNT) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RETRY_CNT, 1); + qp_params.retry_cnt = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT, 1); + qp_params.rnr_retry_cnt = attr->rnr_retry; + } + + if (attr_mask & IB_QP_RQ_PSN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RQ_PSN, 1); + qp_params.rq_psn = attr->rq_psn; + qp->rq_psn = attr->rq_psn; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > dev->attr.max_qp_req_rd_atomic_resc) { + rc = -EINVAL; + DP_ERR(dev, + "unsupported max_rd_atomic=%d, supported=%d\n", + attr->max_rd_atomic, + dev->attr.max_qp_req_rd_atomic_resc); + goto err; + } + + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ, 1); + qp_params.max_rd_atomic_req = attr->max_rd_atomic; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER, 1); + qp_params.min_rnr_nak_timer = attr->min_rnr_timer; + } + + if (attr_mask & IB_QP_SQ_PSN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_SQ_PSN, 1); + qp_params.sq_psn = attr->sq_psn; + qp->sq_psn = attr->sq_psn; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic > + dev->attr.max_qp_resp_rd_atomic_resc) { + DP_ERR(dev, + "unsupported max_dest_rd_atomic=%d, supported=%d\n", + attr->max_dest_rd_atomic, + dev->attr.max_qp_resp_rd_atomic_resc); + + rc = -EINVAL; + goto err; + } + + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP, 1); + qp_params.max_rd_atomic_resp = attr->max_dest_rd_atomic; + } + + if (attr_mask & IB_QP_DEST_QPN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_DEST_QP, 1); + + qp_params.dest_qp = attr->dest_qp_num; + qp->dest_qp_num = attr->dest_qp_num; + } + + if (qp->qp_type != IB_QPT_GSI) + rc = dev->ops->rdma_modify_qp(dev->rdma_ctx, + qp->qed_qp, &qp_params); + + if (attr_mask & IB_QP_STATE) { + if ((qp->qp_type != IB_QPT_GSI) && (!udata)) + qedr_update_qp_state(dev, qp, qp_params.new_state); + qp->state = qp_params.new_state; + } + +err: + return rc; +} + +static int qedr_to_ib_qp_acc_flags(struct qed_rdma_query_qp_out_params *params) +{ + int ib_qp_acc_flags = 0; + + if (params->incoming_rdma_write_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE; + if (params->incoming_rdma_read_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_READ; + if (params->incoming_atomic_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_ATOMIC; + 
ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE; + return ib_qp_acc_flags; +} + +int qedr_query_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct qed_rdma_query_qp_out_params params; + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + int rc = 0; + + memset(&params, 0, sizeof(params)); + + rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, &params); + if (rc) + goto err; + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_attr->qp_state = qedr_get_ibqp_state(params.state); + qp_attr->cur_qp_state = qedr_get_ibqp_state(params.state); + qp_attr->path_mtu = iboe_get_mtu(params.mtu); + qp_attr->path_mig_state = IB_MIG_MIGRATED; + qp_attr->rq_psn = params.rq_psn; + qp_attr->sq_psn = params.sq_psn; + qp_attr->dest_qp_num = params.dest_qp; + + qp_attr->qp_access_flags = qedr_to_ib_qp_acc_flags(&params); + + qp_attr->cap.max_send_wr = qp->sq.max_wr; + qp_attr->cap.max_recv_wr = qp->rq.max_wr; + qp_attr->cap.max_send_sge = qp->sq.max_sges; + qp_attr->cap.max_recv_sge = qp->rq.max_sges; + qp_attr->cap.max_inline_data = qp->max_inline_data; + qp_init_attr->cap = qp_attr->cap; + + memcpy(&qp_attr->ah_attr.grh.dgid.raw[0], &params.dgid.bytes[0], + sizeof(qp_attr->ah_attr.grh.dgid.raw)); + + qp_attr->ah_attr.grh.flow_label = params.flow_label; + qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx; + qp_attr->ah_attr.grh.hop_limit = params.hop_limit_ttl; + qp_attr->ah_attr.grh.traffic_class = params.traffic_class_tos; + + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + qp_attr->ah_attr.port_num = 1; + qp_attr->ah_attr.sl = 0; + qp_attr->timeout = params.timeout; + qp_attr->rnr_retry = params.rnr_retry; + qp_attr->retry_cnt = params.retry_cnt; + qp_attr->min_rnr_timer = params.min_rnr_nak_timer; + qp_attr->pkey_index = params.pkey_index; + qp_attr->port_num = 1; + qp_attr->ah_attr.src_path_bits = 0; + qp_attr->ah_attr.static_rate = 0; + qp_attr->alt_pkey_index = 0; + qp_attr->alt_port_num = 0; + qp_attr->alt_timeout = 0; + memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr)); + + qp_attr->sq_draining = (params.state == QED_ROCE_QP_STATE_SQD) ? 1 : 0; + qp_attr->max_dest_rd_atomic = params.max_dest_rd_atomic; + qp_attr->max_rd_atomic = params.max_rd_atomic; + qp_attr->en_sqd_async_notify = (params.sqd_async) ? 
1 : 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "QEDR_QUERY_QP: max_inline_data=%d\n", + qp_attr->cap.max_inline_data); + +err: + return rc; +} + +int qedr_destroy_qp(struct ib_qp *ibqp) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + struct ib_qp_attr attr; + int attr_mask = 0; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "destroy qp: destroying %p, qp type=%d\n", + qp, qp->qp_type); + + if (qp->state != (QED_ROCE_QP_STATE_RESET | QED_ROCE_QP_STATE_ERR | + QED_ROCE_QP_STATE_INIT)) { + attr.qp_state = IB_QPS_ERR; + attr_mask |= IB_QP_STATE; + + /* Change the QP state to ERROR */ + qedr_modify_qp(ibqp, &attr, attr_mask, NULL); + } + + if (qp->qp_type != IB_QPT_GSI) { + rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp); + if (rc) + return rc; + } + + if (ibqp->uobject && ibqp->uobject->context) { + qedr_cleanup_user_sq(dev, qp); + qedr_cleanup_user_rq(dev, qp); + } else { + qedr_cleanup_kernel_sq(dev, qp); + qedr_cleanup_kernel_rq(dev, qp); + } + + kfree(qp); + + return rc; +} diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 36c8a692f740..056d6cb31fa2 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -62,5 +62,12 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); int qedr_destroy_cq(struct ib_cq *); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, + struct ib_udata *); +int qedr_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int qedr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *); +int qedr_destroy_qp(struct ib_qp *ibqp); #endif diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index b0fc5f2125e0..75c270d839c8 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -69,4 +69,38 @@ struct qedr_create_cq_uresp { __u16 icid; }; +struct qedr_create_qp_ureq { + __u32 qp_handle_hi; + __u32 qp_handle_lo; + + /* SQ */ + /* user space virtual address of SQ buffer */ + __u64 sq_addr; + + /* length of SQ buffer */ + __u64 sq_len; + + /* RQ */ + /* user space virtual address of RQ buffer */ + __u64 rq_addr; + + /* length of RQ buffer */ + __u64 rq_len; +}; + +struct qedr_create_qp_uresp { + __u32 qp_id; + __u32 atomic_supported; + + /* SQ */ + __u32 sq_db_offset; + __u16 sq_icid; + + /* RQ */ + __u32 rq_db_offset; + __u16 rq_icid; + + __u32 rq_db2_offset; +}; + #endif /* __QEDR_USER_H__ */ -- cgit v1.2.3 From 85a624403c77c3f074931aefdfced59f61b668cb Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Thu, 13 Oct 2016 16:13:55 -0700 Subject: ethtool: silence warning on bit loss Sparse was complaining when we went to prototype some code using ethtool_cmd_speed_set and SPEED_100000, which uses the upper 16 bits of __u32 speed for the first time. CHECK ... .../uapi/linux/ethtool.h:123:28: warning: cast truncates bits from constant value (186a0 becomes 86a0) The warning is actually bogus, as no bits are really lost, but we can get rid of the sparse warning with this one small change. Reported-by: Preethi Banala Signed-off-by: Jesse Brandeburg Signed-off-by: David S. 
Miller --- include/uapi/linux/ethtool.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 099a4200732c..8e547231c1b7 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -119,8 +119,7 @@ struct ethtool_cmd { static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, __u32 speed) { - - ep->speed = (__u16)speed; + ep->speed = (__u16)(speed & 0xFFFF); ep->speed_hi = (__u16)(speed >> 16); } -- cgit v1.2.3 From 9f7d416c36124667c406978bcb39746589c35d7f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 14 Oct 2016 16:07:23 +0200 Subject: kprobes: Unpoison stack in jprobe_return() for KASAN I observed false KSAN positives in the sctp code, when sctp uses jprobe_return() in jsctp_sf_eat_sack(). The stray 0xf4 in shadow memory are stack redzones: [ ] ================================================================== [ ] BUG: KASAN: stack-out-of-bounds in memcmp+0xe9/0x150 at addr ffff88005e48f480 [ ] Read of size 1 by task syz-executor/18535 [ ] page:ffffea00017923c0 count:0 mapcount:0 mapping: (null) index:0x0 [ ] flags: 0x1fffc0000000000() [ ] page dumped because: kasan: bad access detected [ ] CPU: 1 PID: 18535 Comm: syz-executor Not tainted 4.8.0+ #28 [ ] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ ] ffff88005e48f2d0 ffffffff82d2b849 ffffffff0bc91e90 fffffbfff10971e8 [ ] ffffed000bc91e90 ffffed000bc91e90 0000000000000001 0000000000000000 [ ] ffff88005e48f480 ffff88005e48f350 ffffffff817d3169 ffff88005e48f370 [ ] Call Trace: [ ] [] dump_stack+0x12e/0x185 [ ] [] kasan_report+0x489/0x4b0 [ ] [] __asan_report_load1_noabort+0x19/0x20 [ ] [] memcmp+0xe9/0x150 [ ] [] depot_save_stack+0x176/0x5c0 [ ] [] save_stack+0xb1/0xd0 [ ] [] kasan_slab_free+0x72/0xc0 [ ] [] kfree+0xc8/0x2a0 [ ] [] skb_free_head+0x79/0xb0 [ ] [] skb_release_data+0x37a/0x420 [ ] [] skb_release_all+0x4f/0x60 [ ] [] consume_skb+0x138/0x370 [ ] [] sctp_chunk_put+0xcb/0x180 [ ] [] sctp_chunk_free+0x58/0x70 [ ] [] sctp_inq_pop+0x68f/0xef0 [ ] [] sctp_assoc_bh_rcv+0xd6/0x4b0 [ ] [] sctp_inq_push+0x131/0x190 [ ] [] sctp_backlog_rcv+0xe9/0xa20 [ ... ] [ ] Memory state around the buggy address: [ ] ffff88005e48f380: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ffff88005e48f400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] >ffff88005e48f480: f4 f4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ^ [ ] ffff88005e48f500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ffff88005e48f580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ================================================================== KASAN stack instrumentation poisons stack redzones on function entry and unpoisons them on function exit. If a function exits abnormally (e.g. with a longjmp like jprobe_return()), stack redzones are left poisoned. Later this leads to random KASAN false reports. Unpoison stack redzones in the frames we are going to jump over before doing actual longjmp in jprobe_return(). Signed-off-by: Dmitry Vyukov Acked-by: Masami Hiramatsu Reviewed-by: Mark Rutland Cc: Mark Rutland Cc: Catalin Marinas Cc: Andrey Ryabinin Cc: Lorenzo Pieralisi Cc: Alexander Potapenko Cc: Will Deacon Cc: Andrew Morton Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. 
Miller" Cc: Masami Hiramatsu Cc: kasan-dev@googlegroups.com Cc: surovegin@google.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1476454043-101898-1-git-send-email-dvyukov@google.com Signed-off-by: Ingo Molnar --- arch/arm64/kernel/sleep.S | 2 +- arch/x86/kernel/kprobes/core.c | 4 ++++ include/linux/kasan.h | 2 ++ mm/kasan/kasan.c | 22 +++++++++++++++++++--- 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index b8799e7c79de..1bec41b5fda3 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -135,7 +135,7 @@ ENTRY(_cpu_resume) #ifdef CONFIG_KASAN mov x0, sp - bl kasan_unpoison_remaining_stack + bl kasan_unpoison_task_stack_below #endif ldp x19, x20, [x29, #16] diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f423b0ef23a7..d9d8d16b69db 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1081,6 +1082,9 @@ void jprobe_return(void) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Unpoison stack redzones in the frames we are going to jump over. */ + kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); + asm volatile ( #ifdef CONFIG_X86_64 " xchg %%rbx,%%rsp \n" diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d600303306eb..820c0ad54a01 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -44,6 +44,7 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); +void kasan_unpoison_stack_above_sp_to(const void *watermark); void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -85,6 +86,7 @@ size_t kasan_metadata_size(struct kmem_cache *cache); static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} +static inline void kasan_unpoison_stack_above_sp_to(const void *watermark) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 88af13c00d3c..70c009741aab 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "kasan.h" #include "../slab.h" @@ -62,7 +63,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } -static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; @@ -77,9 +78,24 @@ void kasan_unpoison_task_stack(struct task_struct *task) } /* Unpoison the stack for the current task beyond a watermark sp value. */ -asmlinkage void kasan_unpoison_remaining_stack(void *sp) +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, sp); + __kasan_unpoison_stack(current, watermark); +} + +/* + * Clear all poison for the region between the current SP and a provided + * watermark value, as is sometimes required prior to hand-crafted asm function + * returns in the middle of functions. 
+ */ +void kasan_unpoison_stack_above_sp_to(const void *watermark) +{ + const void *sp = __builtin_frame_address(0); + size_t size = watermark - sp; + + if (WARN_ON(sp > watermark)) + return; + kasan_unpoison_shadow(sp, size); } /* -- cgit v1.2.3 From d0679cfaf54c9e42e3ba4f8d5e67969932040d16 Mon Sep 17 00:00:00 2001 From: Hoan Tran Date: Fri, 14 Oct 2016 16:20:21 -0700 Subject: mailbox: PCC: Fix return value of pcc_mbox_request_channel() When CONFIG_PCC is disabled, pcc_mbox_request_channel() needs to return ERR_PTR(-ENODEV), not a NULL pointer, as the callers of this function use IS_ERR() to check for error code. Signed-off-by: Duc Dang Signed-off-by: Hoan Tran Signed-off-by: Rafael J. Wysocki --- include/acpi/pcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/pcc.h b/include/acpi/pcc.h index 17a940a14477..8caa79c61703 100644 --- a/include/acpi/pcc.h +++ b/include/acpi/pcc.h @@ -21,7 +21,7 @@ extern void pcc_mbox_free_channel(struct mbox_chan *chan); static inline struct mbox_chan *pcc_mbox_request_channel(struct mbox_client *cl, int subspace_id) { - return NULL; + return ERR_PTR(-ENODEV); } static inline void pcc_mbox_free_channel(struct mbox_chan *chan) { } #endif -- cgit v1.2.3 From a04a480d4392ea6efd117be2de564117b2a009c0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Oct 2016 20:02:52 -0700 Subject: net: Require exact match for TCP socket lookups if dif is l3mdev Currently, socket lookups for l3mdev (vrf) use cases can match a socket that is bound to a port but not a device (ie., a global socket). If the sysctl tcp_l3mdev_accept is not set this leads to ack packets going out based on the main table even though the packet came in from an L3 domain. The end result is that the connection does not establish creating confusion for users since the service is running and a socket shows in ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the skb came through an interface enslaved to an l3mdev device and the tcp_l3mdev_accept is not set. skb's through an l3mdev interface are marked by setting a flag in inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the flag for IPv4. Using an skb flag avoids a device lookup on the dif. The flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the move is done after the socket lookup, so IP6CB is used. The flags field in inet_skb_parm struct needs to be increased to add another flag. There is currently a 1-byte hole following the flags, so it can be expanded to u16 without increasing the size of the struct. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 2 ++ include/linux/ipv6.h | 17 ++++++++++++++--- include/net/ip.h | 8 +++++++- include/net/tcp.h | 13 ++++++++++++- net/ipv4/inet_hashtables.c | 8 +++++--- net/ipv6/inet6_hashtables.c | 7 ++++--- 6 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 85c271c70d42..820de6a9ddde 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -956,6 +956,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (skb->pkt_type == PACKET_LOOPBACK) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IP6CB(skb)->flags |= IP6SKB_L3SLAVE; skb->pkt_type = PACKET_HOST; goto out; } @@ -996,6 +997,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IPCB(skb)->flags |= IPSKB_L3SLAVE; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7e9a789be5e0..ca1ad9ebbc92 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -123,12 +123,12 @@ struct inet6_skb_parm { }; #if defined(CONFIG_NET_L3_MASTER_DEV) -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } @@ -139,11 +139,22 @@ static inline bool skb_l3mdev_slave(__u16 flags) static inline int inet6_iif(const struct sk_buff *skb) { - bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags); + bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +/* can not be used in TCP layer after tcp_v6_fill_cb */ +static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if defined(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && + ipv6_l3mdev_skb(IP6CB(skb)->flags)) + return true; +#endif + return false; +} + struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; diff --git a/include/net/ip.h b/include/net/ip.h index bc43c0fcae12..c9d07988911e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,7 +38,7 @@ struct sock; struct inet_skb_parm { int iif; struct ip_options opt; /* Compiled IP options */ - unsigned char flags; + u16 flags; #define IPSKB_FORWARDED BIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) @@ -48,10 +48,16 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_FRAG_SEGS BIT(7) +#define IPSKB_L3SLAVE BIT(8) u16 frag_max_size; }; +static inline bool ipv4_l3mdev_skb(u16 flags) +{ + return !!(flags & IPSKB_L3SLAVE); +} + static inline unsigned int ip_hdrlen(const struct sk_buff *skb) { return ip_hdr(skb)->ihl * 4; diff --git a/include/net/tcp.h b/include/net/tcp.h index f83b7f220a65..5b82d4d94834 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -794,12 +794,23 @@ struct tcp_skb_cb { */ static inline int tcp_v6_iif(const struct sk_buff *skb) { - bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags); + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? 
skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } #endif +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && + ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + return true; +#endif + return false; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 77c20a489218..ca97835bfec4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -25,6 +25,7 @@ #include #include #include +#include #include static u32 inet_ehashfn(const struct net *net, const __be32 laddr, @@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -186,7 +187,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -215,11 +216,12 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet_exact_dif_match(net, skb); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 00cf28ad4565..2fd0374a35b1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; @@ -109,7 +109,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score++; @@ -131,11 +131,12 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet6_exact_dif_match(net, skb); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { -- cgit v1.2.3 From 9224eb77e63f70f16c0b6b7a20ca7d395f3bc077 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 17 Oct 2016 16:00:46 +0100 Subject: irqchip/gic-v3-its: Fix entry size mask for GITS_BASER Entry Size in GITS_BASER occupies 5 bits [52:48], but we mask out 8 bits. 
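To see why the wider mask matters: the GITS_BASER fields above bit 52 (for example the OuterCache attribute at bits [55:53]) can be non-zero, and an 8-bit mask folds them into the computed entry size. A stand-alone illustrative sketch (editor's example, not part of the patch; the register value is made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* hypothetical GITS_BASER: OuterCache = 1, Entry_Size = 7 (i.e. 8-byte entries) */
            uint64_t r = (1ULL << 53) | (7ULL << 48);

            unsigned old_sz = (unsigned)(((r >> 48) & 0xff) + 1); /* 8-bit mask -> 40, wrong */
            unsigned new_sz = (unsigned)(((r >> 48) & 0x1f) + 1); /* 5-bit mask -> 8, correct */

            printf("old=%u fixed=%u\n", old_sz, new_sz);
            return 0;
    }
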
Fixes: cc2d3216f53c ("irqchip: GICv3: ITS command queue") Cc: stable@vger.kernel.org Signed-off-by: Vladimir Murzin Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 8361c8d3edd1..b7e34313cdfe 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -290,7 +290,7 @@ #define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) -#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) -- cgit v1.2.3 From 71757904efadefdf5505712f675218ce59483c5d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 17 Oct 2016 08:18:15 -0700 Subject: generic syscalls: kill cruft from removed pkey syscalls pkey_set() and pkey_get() were syscalls present in older versions of the protection keys patches. They were fully excised from the x86 code, but some cruft was left in the generic syscall code. The C++ comments were intended to help to make it more glaring to me to fix them before actually submitting them. That technique worked, but later than I would have liked. I test-compiled this for arm64. Fixes: a60f7b69d92c0 ("generic syscalls: Wire up memory protection keys syscalls") Signed-off-by: Dave Hansen Acked-by: Arnd Bergmann Cc: Thomas Gleixner Cc: x86@kernel.org Cc: linux-arch@vger.kernel.org Cc: mgorman@techsingularity.net Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 --- include/uapi/asm-generic/unistd.h | 4 ---- 2 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0d7abb8b7315..91a740f6b884 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -902,8 +902,5 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, unsigned long prot, int pkey); asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val); asmlinkage long sys_pkey_free(int pkey); -//asmlinkage long sys_pkey_get(int pkey, unsigned long flags); -//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights, -// unsigned long flags); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index dbfee7e86ba6..9b1462e38b82 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -730,10 +730,6 @@ __SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect) __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) #define __NR_pkey_free 290 __SYSCALL(__NR_pkey_free, sys_pkey_free) -#define __NR_pkey_get 291 -//__SYSCALL(__NR_pkey_get, sys_pkey_get) -#define __NR_pkey_set 292 -//__SYSCALL(__NR_pkey_set, sys_pkey_set) #undef __NR_syscalls #define __NR_syscalls 291 -- cgit v1.2.3 From 339e1e54891c339b30023c9cc8a005cbf65a3c0c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 8 Oct 2016 16:59:38 +0800 Subject: clk: core: add __init decoration for CLK_OF_DECLARE_DRIVER function The new introduced macro CLK_OF_DECLARE_DRIVER is usually used to declare clock driver init functions, which are mostly decorated with __init. 
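For context, a typical client declaration looks roughly like the sketch below (editor's illustration; the driver name and compatible string are invented). The setup function lives in .init.text, so the wrapper that the macro generates around it has to be __init as well, otherwise modpost reports a section mismatch:

    #include <linux/clk-provider.h>
    #include <linux/of.h>

    static void __init foo_clk_init(struct device_node *np)
    {
            /* parse the DT node and register the clocks it describes */
    }
    CLK_OF_DECLARE_DRIVER(foo_clk, "vendor,foo-clk", foo_clk_init);
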
Add __init decoration for CLK_OF_DECLARE_DRIVER function to avoid causing section mismatch warnings on client clock drivers. Signed-off-by: Shawn Guo Fixes: c7296c51ce5d ("clk: core: New macro CLK_OF_DECLARE_DRIVER") Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index af596381fa0f..a428aec36ace 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -785,7 +785,7 @@ extern struct of_device_id __clk_of_table; * routines, one at of_clk_init(), and one at platform device probe */ #define CLK_OF_DECLARE_DRIVER(name, compat, fn) \ - static void name##_of_clk_init_driver(struct device_node *np) \ + static void __init name##_of_clk_init_driver(struct device_node *np) \ { \ of_node_clear_flag(np, OF_POPULATED); \ fn(np); \ -- cgit v1.2.3 From 19be0eaffa3ac7d8eb6784ad9bdbc7d67ed8e619 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Oct 2016 13:07:36 -0700 Subject: mm: remove gup_flags FOLL_WRITE games from __get_user_pages() This is an ancient bug that was actually attempted to be fixed once (badly) by me eleven years ago in commit 4ceb5db9757a ("Fix get_user_pages() race for write access") but that was then undone due to problems on s390 by commit f33ea7f404e5 ("fix get_user_pages bug"). In the meantime, the s390 situation has long been fixed, and we can now fix it by checking the pte_dirty() bit properly (and do it better). The s390 dirty bit was implemented in abf09bed3cce ("s390/mm: implement software dirty bits") which made it into v3.9. Earlier kernels will have to look at the page state itself. Also, the VM has become more scalable, and what used a purely theoretical race back then has become easier to trigger. To fix it, we introduce a new internal FOLL_COW flag to mark the "yes, we already did a COW" rather than play racy games with FOLL_WRITE that is very fundamental, and then use the pte dirty flag to validate that the FOLL_COW flag is still valid. Reported-and-tested-by: Phil "not Paul" Oester Acked-by: Hugh Dickins Reviewed-by: Michal Hocko Cc: Andy Lutomirski Cc: Kees Cook Cc: Oleg Nesterov Cc: Willy Tarreau Cc: Nick Piggin Cc: Greg Thelen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/gup.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e9caec6a51e9..ed85879f47f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2232,6 +2232,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ +#define FOLL_COW 0x4000 /* internal GUP flag */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/gup.c b/mm/gup.c index 96b2b2fd0fbd..22cc22e7432f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -60,6 +60,16 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } +/* + * FOLL_FORCE can write to even unwritable pte's, but only + * after we've gone through a COW cycle and they are dirty. 
+ */ +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +{ + return pte_write(pte) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -95,7 +105,7 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) { + if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } @@ -412,7 +422,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags &= ~FOLL_WRITE; + *flags |= FOLL_COW; return 0; } -- cgit v1.2.3 From d4944b0ecec0af882483fe44b66729316e575208 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:12 +0100 Subject: mm: remove write/force parameters from __get_user_pages_unlocked() This removes the redundant 'write' and 'force' parameters from __get_user_pages_unlocked() to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Reviewed-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- mm/gup.c | 17 +++++++++-------- mm/nommu.c | 12 +++++++++--- mm/process_vm_access.c | 7 +++++-- virt/kvm/async_pf.c | 3 ++- virt/kvm/kvm_main.c | 11 ++++++++--- 6 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ed85879f47f5..bcdea1f4e98c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1285,8 +1285,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags); + struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, diff --git a/mm/gup.c b/mm/gup.c index 720eb9385204..e997f545b059 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -875,17 +875,11 @@ EXPORT_SYMBOL(get_user_pages_locked); */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; int locked = 1; - if (write) - gup_flags |= FOLL_WRITE; - if (force) - gup_flags |= FOLL_FORCE; - down_read(&mm->mmap_sem); ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL, &locked, false, gup_flags); @@ -915,8 +909,15 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { + unsigned int flags = FOLL_TOUCH; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, FOLL_TOUCH); + pages, flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/nommu.c b/mm/nommu.c index 
95daf81a4855..925dcc1fa2f3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -185,8 +185,7 @@ EXPORT_SYMBOL(get_user_pages_locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -200,8 +199,15 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { + unsigned int flags = 0; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, 0); + pages, flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 07514d41ebcc..be8dc8d1edb9 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,12 +88,16 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); + unsigned int flags = FOLL_REMOTE; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + if (vm_write) + flags |= FOLL_WRITE; + while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); size_t bytes; @@ -104,8 +108,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ pages = __get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages, - FOLL_REMOTE); + process_pages, flags); if (pages <= 0) return -EFAULT; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index db9668869f6f..8035cc1eb955 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -84,7 +84,8 @@ static void async_pf_execute(struct work_struct *work) * mm and might be done in another context, so we must * use FOLL_REMOTE. */ - __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE); + __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, + FOLL_WRITE | FOLL_REMOTE); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 81dfc73d3df3..28510e72618a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1416,10 +1416,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, down_read(&current->mm->mmap_sem); npages = get_user_page_nowait(addr, write_fault, page); up_read(&current->mm->mmap_sem); - } else + } else { + unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; + + if (write_fault) + flags |= FOLL_WRITE; + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - write_fault, 0, page, - FOLL_TOUCH|FOLL_HWPOISON); + page, flags); + } if (npages != 1) return npages; -- cgit v1.2.3 From c164154f66f0c9b02673f07aa4f044f1d9c70274 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:13 +0100 Subject: mm: replace get_user_pages_unlocked() write/force parameters with gup_flags This removes the 'write' and 'force' use from get_user_pages_unlocked() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. 
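The caller-side conversion follows one pattern; a hypothetical example (editor's sketch, not taken from the diffs below) of a caller that needs write access to the pages:

    struct page *pages[16];
    unsigned long start = 0x100000UL;   /* some user virtual address, illustrative only */
    long got;

    /* before this series: intent hidden behind two ints (write = 1, force = 0) */
    got = get_user_pages_unlocked(start, 16, 1, 0, pages);

    /* after: the same request, with the FOLL_* intent explicit at the call site */
    got = get_user_pages_unlocked(start, 16, pages, FOLL_WRITE);
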
Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 2 +- arch/s390/mm/gup.c | 3 ++- arch/sh/mm/gup.c | 3 ++- arch/sparc/mm/gup.c | 3 ++- arch/x86/mm/gup.c | 2 +- drivers/media/pci/ivtv/ivtv-udma.c | 4 ++-- drivers/media/pci/ivtv/ivtv-yuv.c | 5 +++-- drivers/scsi/st.c | 5 ++--- drivers/video/fbdev/pvr2fb.c | 4 ++-- include/linux/mm.h | 2 +- mm/gup.c | 14 ++++---------- mm/nommu.c | 11 ++--------- mm/util.c | 3 ++- net/ceph/pagevec.c | 2 +- 14 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 42d124fb6474..d8c3c159289a 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -287,7 +287,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - write, 0, pages); + pages, write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index adb0c34bf431..18d4107e10ee 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -266,7 +266,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) ret = (ret < 0) ? nr : ret + nr; diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 40fa6c8adc43..063c298ba56c 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -258,7 +258,8 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, write, 0, pages); + (end - start) >> PAGE_SHIFT, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 4e06750a5d29..cd0e32bbcb1d 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -238,7 +238,8 @@ slow: pages += nr; ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, write, 0, pages); + (end - start) >> PAGE_SHIFT, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index b8b6a60b32cf..0d4fb3ebbbac 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -435,7 +435,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - write, 0, pages); + pages, write ? 
FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 4769469fe842..2c9232ef7baa 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -124,8 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, } /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0, - 1, dma->map); + err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, + dma->map, FOLL_FORCE); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index b094054cda6e..f7299d3d8244 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -76,11 +76,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Get user pages for DMA Xfer */ y_pages = get_user_pages_unlocked(y_dma.uaddr, - y_dma.page_count, 0, 1, &dma->map[0]); + y_dma.page_count, &dma->map[0], FOLL_FORCE); uv_pages = 0; /* silence gcc. value is set and consumed only if: */ if (y_pages == y_dma.page_count) { uv_pages = get_user_pages_unlocked(uv_dma.uaddr, - uv_dma.page_count, 0, 1, &dma->map[y_pages]); + uv_dma.page_count, &dma->map[y_pages], + FOLL_FORCE); } if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) { diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 7af5226aa55b..618422ea3a41 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,9 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, res = get_user_pages_unlocked( uaddr, nr_pages, - rw == READ, - 0, /* don't force */ - pages); + pages, + rw == READ ? FOLL_WRITE : 0); /* don't force */ /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index 3b1ca4411073..a2564ab91e62 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,8 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE, - 0, pages); + ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, pages, + FOLL_WRITE); if (ret < nr_pages) { nr_pages = ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index bcdea1f4e98c..abd53f2eb74e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1287,7 +1287,7 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages); + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index e997f545b059..373d1ec006e4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -907,17 +907,10 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * "force" parameter). 
*/ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, flags); + pages, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -1535,7 +1528,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/nommu.c b/mm/nommu.c index 925dcc1fa2f3..7e27add39f7e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -197,17 +197,10 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { - unsigned int flags = 0; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, flags); + pages, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/util.c b/mm/util.c index 662cddf914af..4c685bde5ebc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,7 +283,8 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, pages, + write ? FOLL_WRITE : 0); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 00d2601407c5..1a7c9a79a53c 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -26,7 +26,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_unlocked( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, 0, pages + got); + num_pages - got, pages + got, write_page ? FOLL_WRITE : 0); if (rc < 0) break; BUG_ON(rc == 0); -- cgit v1.2.3 From e4961b0768852d9eb7383e1a5df178eacb714656 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 Oct 2016 16:57:08 +0300 Subject: net: core: Correctly iterate over lower adjacency list Tamir reported the following trace when processing ARP requests received via a vlan device on top of a VLAN-aware bridge: NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [swapper/1:0] [...] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.8.0-rc7 #1 Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016 task: ffff88017edfea40 task.stack: ffff88017ee10000 RIP: 0010:[] [] netdev_all_lower_get_next_rcu+0x33/0x60 [...] Call Trace: [] mlxsw_sp_port_lower_dev_hold+0x5a/0xa0 [mlxsw_spectrum] [] mlxsw_sp_router_netevent_event+0x80/0x150 [mlxsw_spectrum] [] notifier_call_chain+0x4a/0x70 [] atomic_notifier_call_chain+0x1a/0x20 [] call_netevent_notifiers+0x1b/0x20 [] neigh_update+0x306/0x740 [] neigh_event_ns+0x4e/0xb0 [] arp_process+0x66f/0x700 [] ? common_interrupt+0x8c/0x8c [] arp_rcv+0x139/0x1d0 [] ? vlan_do_receive+0xda/0x320 [] __netif_receive_skb_core+0x524/0xab0 [] ? 
dev_queue_xmit+0x10/0x20 [] ? br_forward_finish+0x3d/0xc0 [bridge] [] ? br_handle_vlan+0xf6/0x1b0 [bridge] [] __netif_receive_skb+0x18/0x60 [] netif_receive_skb_internal+0x40/0xb0 [] netif_receive_skb+0x1c/0x70 [] br_pass_frame_up+0xc6/0x160 [bridge] [] ? deliver_clone+0x37/0x50 [bridge] [] ? br_flood+0xcc/0x160 [bridge] [] br_handle_frame_finish+0x224/0x4f0 [bridge] [] br_handle_frame+0x174/0x300 [bridge] [] __netif_receive_skb_core+0x329/0xab0 [] ? find_next_bit+0x15/0x20 [] ? cpumask_next_and+0x32/0x50 [] ? load_balance+0x178/0x9b0 [] __netif_receive_skb+0x18/0x60 [] netif_receive_skb_internal+0x40/0xb0 [] netif_receive_skb+0x1c/0x70 [] mlxsw_sp_rx_listener_func+0x61/0xb0 [mlxsw_spectrum] [] mlxsw_core_skb_receive+0x187/0x200 [mlxsw_core] [] mlxsw_pci_cq_tasklet+0x63a/0x9b0 [mlxsw_pci] [] tasklet_action+0xf6/0x110 [] __do_softirq+0xf6/0x280 [] irq_exit+0xdf/0xf0 [] do_IRQ+0x54/0xd0 [] common_interrupt+0x8c/0x8c The problem is that netdev_all_lower_get_next_rcu() never advances the iterator, thereby causing the loop over the lower adjacency list to run forever. Fix this by advancing the iterator and avoid the infinite loop. Fixes: 7ce856aaaf13 ("mlxsw: spectrum: Add couple of lower device helper functions") Signed-off-by: Ido Schimmel Reported-by: Tamir Winetroub Reviewed-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 136ae6bbe81e..465e128699a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3877,7 +3877,7 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, ldev = netdev_all_lower_get_next(dev, &(iter))) #define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \ - for (iter = (dev)->all_adj_list.lower.next, \ + for (iter = &(dev)->all_adj_list.lower, \ ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \ ldev; \ ldev = netdev_all_lower_get_next_rcu(dev, &(iter))) diff --git a/net/core/dev.c b/net/core/dev.c index f1fe26f66458..b09ac57f4348 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5511,10 +5511,14 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, { struct netdev_adjacent *lower; - lower = list_first_or_null_rcu(&dev->all_adj_list.lower, - struct netdev_adjacent, list); + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->all_adj_list.lower) + return NULL; + + *iter = &lower->list; - return lower ? lower->dev : NULL; + return lower->dev; } EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); -- cgit v1.2.3 From 85dda4e5b0ee1f5b4e8cc93d39e475006bc61ccd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 18 Oct 2016 18:59:34 +0200 Subject: rtnetlink: Add rtnexthop offload flag to compare mask The offload flag is a status flag and should not be used by FIB semantics for comparison. Fixes: 37ed9493699c ("rtnetlink: add RTNH_F_EXTERNAL flag for fib offload") Signed-off-by: Jiri Pirko Reviewed-by: Andy Gospodarek Signed-off-by: David S. 
Miller --- include/uapi/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 262f0379d83a..5a78be518101 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -350,7 +350,7 @@ struct rtnexthop { #define RTNH_F_OFFLOAD 8 /* offloaded route */ #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ -#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN) +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD) /* Macros to handle hexthops */ -- cgit v1.2.3 From 3b913179c3fa89dd0e304193fa0c746fc0481447 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:14 +0100 Subject: mm: replace get_user_pages_locked() write/force parameters with gup_flags This removes the 'write' and 'force' use from get_user_pages_locked() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/frame_vector.c | 8 +++++++- mm/gup.c | 12 +++--------- mm/nommu.c | 5 ++++- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index abd53f2eb74e..9fe9b0438169 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1282,7 +1282,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, int *locked); + unsigned int gup_flags, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 381bb07ed14f..81b67498bb9c 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -41,10 +41,16 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, int ret = 0; int err; int locked; + unsigned int gup_flags = 0; if (nr_frames == 0) return 0; + if (write) + gup_flags |= FOLL_WRITE; + if (force) + gup_flags |= FOLL_FORCE; + if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; @@ -59,7 +65,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, vec->got_ref = true; vec->is_pfns = false; ret = get_user_pages_locked(start, nr_frames, - write, force, (struct page **)(vec->ptrs), &locked); + gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } diff --git a/mm/gup.c b/mm/gup.c index 373d1ec006e4..3cfb55ef0cf4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -848,18 +848,12 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * up_read(&mm->mmap_sem); */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, true, flags); + pages, NULL, locked, true, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); 
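A minimal, hypothetical caller sketch (not part of the patch; the helper and its write-only request are assumptions) showing the same conversion for get_user_pages_locked(), where gup_flags now sits before the pages array in the argument list:

	static long pin_pages_locked_for_write(unsigned long start, unsigned long nr,
					       struct page **pages, int *locked)
	{
		/* Before this series: get_user_pages_locked(start, nr, 1, 0, pages, locked) */
		return get_user_pages_locked(start, nr, FOLL_WRITE, pages, locked);
	}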
diff --git a/mm/nommu.c b/mm/nommu.c index 7e27add39f7e..842cfdd1a31e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,12 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { + int write = gup_flags & FOLL_WRITE; + int force = gup_flags & FOLL_FORCE; + return get_user_pages(start, nr_pages, write, force, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); -- cgit v1.2.3 From 7f23b3504a0df63b724180262c5f3f117f21bcae Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:15 +0100 Subject: mm: replace get_vaddr_frames() write/force parameters with gup_flags This removes the 'write' and 'force' from get_vaddr_frames() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 3 ++- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/media/v4l2-core/videobuf2-memops.c | 6 +++++- include/linux/mm.h | 2 +- mm/frame_vector.c | 13 ++----------- 5 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index aa92decf4233..fbd13fabdf2d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -488,7 +488,8 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, goto err_free; } - ret = get_vaddr_frames(start, npages, true, true, g2d_userptr->vec); + ret = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE, + g2d_userptr->vec); if (ret != npages) { DRM_ERROR("failed to get user pages from userptr.\n"); if (ret < 0) diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index e668dde6d857..a31b95cb3b09 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -214,7 +214,7 @@ static int omap_vout_get_userptr(struct videobuf_buffer *vb, u32 virtp, if (!vec) return -ENOMEM; - ret = get_vaddr_frames(virtp, 1, true, false, vec); + ret = get_vaddr_frames(virtp, 1, FOLL_WRITE, vec); if (ret != 1) { frame_vector_destroy(vec); return -EINVAL; diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 3c3b517f1d1c..1cd322e939c7 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c @@ -42,6 +42,10 @@ struct frame_vector *vb2_create_framevec(unsigned long start, unsigned long first, last; unsigned long nr; struct frame_vector *vec; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; first = start >> PAGE_SHIFT; last = (start + length - 1) >> PAGE_SHIFT; @@ -49,7 +53,7 @@ struct frame_vector *vb2_create_framevec(unsigned long start, vec = frame_vector_create(nr); if (!vec) return ERR_PTR(-ENOMEM); - ret = get_vaddr_frames(start & PAGE_MASK, nr, write, true, vec); + ret = get_vaddr_frames(start & PAGE_MASK, nr, flags, vec); if (ret < 0) goto out_destroy; /* We accept only complete set of PFNs */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 9fe9b0438169..91cc923ce985 100644 --- a/include/linux/mm.h +++ 
b/include/linux/mm.h @@ -1305,7 +1305,7 @@ struct frame_vector { struct frame_vector *frame_vector_create(unsigned int nr_frames); void frame_vector_destroy(struct frame_vector *vec); int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, - bool write, bool force, struct frame_vector *vec); + unsigned int gup_flags, struct frame_vector *vec); void put_vaddr_frames(struct frame_vector *vec); int frame_vector_to_pages(struct frame_vector *vec); void frame_vector_to_pfns(struct frame_vector *vec); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 81b67498bb9c..db77dcb38afd 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -11,10 +11,7 @@ * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map - * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. See description of the same argument of - get_user_pages(). + * @gup_flags: flags modifying lookup behaviour * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * @@ -34,23 +31,17 @@ * This function takes care of grabbing mmap_sem as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, - bool write, bool force, struct frame_vector *vec) + unsigned int gup_flags, struct frame_vector *vec) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret = 0; int err; int locked; - unsigned int gup_flags = 0; if (nr_frames == 0) return 0; - if (write) - gup_flags |= FOLL_WRITE; - if (force) - gup_flags |= FOLL_FORCE; - if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; -- cgit v1.2.3 From 768ae309a96103ed02eb1e111e838c87854d8b51 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:16 +0100 Subject: mm: replace get_user_pages() write/force parameters with gup_flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the 'write' and 'force' from get_user_pages() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. 
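A hedged caller-side sketch (illustrative only; the helper and its read-with-force request are invented here) of how an old write/force call to get_user_pages() maps onto the new gup_flags argument:

	/*
	 * Hypothetical example: a forced read-only pin, as several of the
	 * converted callers below do. The caller is still expected to hold
	 * mm->mmap_sem for read around this call at this point in the tree.
	 */
	static long pin_readonly_forced(unsigned long start, unsigned long nr,
					struct page **pages)
	{
		/* Before this series: get_user_pages(start, nr, 0, 1, pages, NULL) */
		return get_user_pages(start, nr, FOLL_FORCE, pages, NULL);
	}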
Signed-off-by: Lorenzo Stoakes Acked-by: Christian König Acked-by: Jesper Nilsson Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- arch/cris/arch-v32/drivers/cryptocop.c | 4 +--- arch/ia64/kernel/err_inject.c | 2 +- arch/x86/mm/mpx.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 +++++-- drivers/gpu/drm/radeon/radeon_ttm.c | 3 ++- drivers/gpu/drm/via/via_dmablit.c | 4 ++-- drivers/infiniband/core/umem.c | 6 +++++- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 +- drivers/infiniband/hw/qib/qib_user_pages.c | 3 ++- drivers/infiniband/hw/usnic/usnic_uiom.c | 5 ++++- drivers/media/v4l2-core/videobuf-dma-sg.c | 7 +++++-- drivers/misc/mic/scif/scif_rma.c | 3 +-- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 3 ++- .../vc04_services/interface/vchiq_arm/vchiq_2835_arm.c | 3 +-- .../vc04_services/interface/vchiq_arm/vchiq_arm.c | 3 +-- drivers/virt/fsl_hypervisor.c | 4 ++-- include/linux/mm.h | 2 +- mm/gup.c | 12 +++--------- mm/mempolicy.c | 2 +- mm/nommu.c | 18 ++++-------------- 22 files changed, 49 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index b5698c876fcc..099e170a93ee 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -2722,7 +2722,6 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig err = get_user_pages((unsigned long int)(oper.indata + prev_ix), noinpages, 0, /* read access only for in data */ - 0, /* no force */ inpages, NULL); @@ -2736,8 +2735,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig if (oper.do_cipher){ err = get_user_pages((unsigned long int)oper.cipher_outdata, nooutpages, - 1, /* write access for out data */ - 0, /* no force */ + FOLL_WRITE, /* write access for out data */ outpages, NULL); up_read(¤t->mm->mmap_sem); diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 09f845793d12..5ed0ea92c5bf 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, u64 virt_addr=simple_strtoull(buf, NULL, 16); int ret; - ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL); + ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG printk("Virtual address %lx is not existing.\n",virt_addr); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 80476878eb4c..e4f800999b32 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -544,10 +544,9 @@ static int mpx_resolve_fault(long __user *addr, int write) { long gup_ret; int nr_pages = 1; - int force = 0; - gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, - force, NULL, NULL); + gup_ret = get_user_pages((unsigned long)addr, nr_pages, + write ? FOLL_WRITE : 0, NULL, NULL); /* * get_user_pages() returns number of pages gotten. 
* 0 means we failed to fault in and get anything, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 887483b8b818..dcaf691f56b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -555,10 +555,13 @@ struct amdgpu_ttm_tt { int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) { struct amdgpu_ttm_tt *gtt = (void *)ttm; - int write = !(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY); + unsigned int flags = 0; unsigned pinned = 0; int r; + if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) + flags |= FOLL_WRITE; + if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { /* check that we only use anonymous memory to prevent problems with writeback */ @@ -581,7 +584,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) list_add(&guptask.list, >t->guptasks); spin_unlock(>t->guptasklock); - r = get_user_pages(userptr, num_pages, write, 0, p, NULL); + r = get_user_pages(userptr, num_pages, flags, p, NULL); spin_lock(>t->guptasklock); list_del(&guptask.list); diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 455268214b89..3de5e6e21662 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -566,7 +566,8 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; struct page **pages = ttm->pages + pinned; - r = get_user_pages(userptr, num_pages, write, 0, pages, NULL); + r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0, + pages, NULL); if (r < 0) goto release_pages; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 7e2a12c4fed2..1a3ad769f8c8 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -241,8 +241,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) down_read(¤t->mm->mmap_sem); ret = get_user_pages((unsigned long)xfer->mem_addr, vsg->num_pages, - (vsg->direction == DMA_FROM_DEVICE), - 0, vsg->pages, NULL); + (vsg->direction == DMA_FROM_DEVICE) ? 
FOLL_WRITE : 0, + vsg->pages, NULL); up_read(¤t->mm->mmap_sem); if (ret != vsg->num_pages) { diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c68746ce6624..224ad274ea0b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, unsigned long dma_attrs = 0; struct scatterlist *sg, *sg_list_start; int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -183,6 +184,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret) goto out; + if (!umem->writable) + gup_flags |= FOLL_FORCE; + need_release = 1; sg_list_start = umem->sg_head.sgl; @@ -190,7 +194,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, vma_list); + gup_flags, page_list, vma_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 6c00d04b8b28..c6fe89d79248 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL); + ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 2d2b94fd3633..75f08624ac05 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -67,7 +67,8 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, for (got = 0; got < num_pages; got += ret) { ret = get_user_pages(start_page + got * PAGE_SIZE, - num_pages - got, 1, 1, + num_pages - got, + FOLL_WRITE | FOLL_FORCE, p + got, NULL); if (ret < 0) goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index a0b6ebee4d8a..1ccee6ea5bc3 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -111,6 +111,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int i; int flags; dma_addr_t pa; + unsigned int gup_flags; if (!can_do_mlock()) return -EPERM; @@ -135,6 +136,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, flags = IOMMU_READ | IOMMU_CACHE; flags |= (writable) ? IOMMU_WRITE : 0; + gup_flags = FOLL_WRITE; + gup_flags |= (writable) ? 
0 : FOLL_FORCE; cur_base = addr & PAGE_MASK; ret = 0; @@ -142,7 +145,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - 1, !writable, page_list, NULL); + gup_flags, page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300f060b3f3..1db0af6c7f94 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -156,6 +156,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, { unsigned long first, last; int err, rw = 0; + unsigned int flags = FOLL_FORCE; dma->direction = direction; switch (dma->direction) { @@ -178,12 +179,14 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, if (NULL == dma->pages) return -ENOMEM; + if (rw == READ) + flags |= FOLL_WRITE; + dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); err = get_user_pages(data & PAGE_MASK, dma->nr_pages, - rw == READ, 1, /* force */ - dma->pages, NULL); + flags, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index e0203b1a20fd..f806a4471eb9 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -1396,8 +1396,7 @@ retry: pinned_pages->nr_pages = get_user_pages( (u64)addr, nr_pages, - !!(prot & SCIF_PROT_WRITE), - 0, + (prot & SCIF_PROT_WRITE) ? FOLL_WRITE : 0, pinned_pages->pages, NULL); up_write(&mm->mmap_sem); diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index a2d97b9b17e3..6fb773dbcd0c 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -198,7 +198,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 07462d79d040..1aba2c74160e 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -309,7 +309,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, * much memory to the process. */ down_read(¤t->mm->mmap_sem); - ret = get_user_pages(address, 1, !is_write, 0, &page, NULL); + ret = get_user_pages(address, 1, is_write ? 0 : FOLL_WRITE, + &page, NULL); up_read(¤t->mm->mmap_sem); if (ret < 0) break; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 436dfe871d32..9013a585507e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -892,7 +892,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, down_read(¤t->mm->mmap_sem); pinned = get_user_pages( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, 0, + nr_pages, + dir == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, page_list, NULL); up_read(¤t->mm->mmap_sem); diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c index c29040fdf9a7..1091b9f1dd07 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c @@ -423,8 +423,7 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, actual_pages = get_user_pages(task, task->mm, (unsigned long)buf & ~(PAGE_SIZE - 1), num_pages, - (type == PAGELIST_READ) /*Write */ , - 0 /*Force */ , + (type == PAGELIST_READ) ? FOLL_WRITE : 0, pages, NULL /*vmas */); up_read(&task->mm->mmap_sem); diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index e11c0e07471b..7b6cd4d80621 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -1477,8 +1477,7 @@ dump_phys_mem(void *virt_addr, uint32_t num_bytes) current->mm, /* mm */ (unsigned long)virt_addr, /* start */ num_pages, /* len */ - 0, /* write */ - 0, /* force */ + 0, /* gup_flags */ pages, /* pages (array of page pointers) */ NULL); /* vmas */ up_read(¤t->mm->mmap_sem); diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 60bdad3a689b..150ce2abf6c8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -245,8 +245,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ down_read(¤t->mm->mmap_sem); num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? READ : WRITE, - 0, pages, NULL); + num_pages, (param.source == -1) ? 0 : FOLL_WRITE, + pages, NULL); up_read(¤t->mm->mmap_sem); if (num_pinned != num_pages) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 91cc923ce985..30bb5d9631bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1279,7 +1279,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, int write, int force, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); diff --git a/mm/gup.c b/mm/gup.c index 3cfb55ef0cf4..bbc2e76cdd77 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -987,18 +987,12 @@ EXPORT_SYMBOL(get_user_pages_remote); * obviously don't pass FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, false, flags); + pages, vmas, NULL, false, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ad1c96ac313c..0b859af06b87 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -850,7 +850,7 @@ static int lookup_node(unsigned long addr) struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); diff --git a/mm/nommu.c b/mm/nommu.c index 842cfdd1a31e..70cb844dfd95 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -160,18 +160,11 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(current, current->mm, start, nr_pages, flags, - pages, vmas, NULL); + return __get_user_pages(current, current->mm, start, nr_pages, + gup_flags, pages, vmas, NULL); } EXPORT_SYMBOL(get_user_pages); @@ -179,10 +172,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { - int write = gup_flags & FOLL_WRITE; - int force = gup_flags & FOLL_FORCE; - - return get_user_pages(start, nr_pages, write, force, pages, NULL); + return get_user_pages(start, nr_pages, gup_flags, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); -- cgit v1.2.3 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. 
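As a rough illustration of the remote variant after this change (hypothetical helper, not from the patch; error handling is simplified), a forced read of a single page from another process's address space would look like:

	/*
	 * Illustrative only: the caller holds a reference on mm and takes
	 * mm->mmap_sem for read, as the converted callers below do.
	 */
	static int read_one_remote_page(struct task_struct *tsk, struct mm_struct *mm,
					unsigned long addr, struct page **page)
	{
		/* Before this series: get_user_pages_remote(tsk, mm, addr, 1, 0, 1, page, NULL) */
		long ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_FORCE, page, NULL);

		return ret == 1 ? 0 : -EFAULT;
	}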
Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 7 +++++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +++++- drivers/infiniband/core/umem_odp.c | 7 +++++-- fs/exec.c | 9 +++++++-- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 6 ++++-- mm/gup.c | 22 +++++++--------------- mm/memory.c | 6 +++++- security/tomoyo/domain.c | 2 +- 9 files changed, 40 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 5ce3603e6eac..0370b842d9cc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages( int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; struct page **pvec; uintptr_t ptr; + unsigned int flags = 0; pvec = drm_malloc_ab(npages, sizeof(struct page *)); if (!pvec) return ERR_PTR(-ENOMEM); + if (!etnaviv_obj->userptr.ro) + flags |= FOLL_WRITE; + pinned = 0; ptr = etnaviv_obj->userptr.ptr; down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - !etnaviv_obj->userptr.ro, 0, - pvec + pinned, NULL); + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index e537930c64b5..c6f780f5abc9 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY); if (pvec != NULL) { struct mm_struct *mm = obj->userptr.mm->mm; + unsigned int flags = 0; + + if (!obj->userptr.read_only) + flags |= FOLL_WRITE; ret = -EFAULT; if (atomic_inc_not_zero(&mm->mm_users)) { @@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) (work->task, mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, - !obj->userptr.read_only, 0, + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..1f0fe3217f23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page 
*get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index 30bb5d9631bb..ecc4be7b67e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1276,7 +1276,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index bbc2e76cdd77..7aa113c2d373 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -915,9 +915,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). + * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -946,9 +944,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. 
* * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual @@ -965,18 +963,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH | FOLL_REMOTE; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, flags); + NULL, false, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); diff --git a/mm/memory.c b/mm/memory.c index fc1987dfd8cc..20a9adb7b36e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3873,6 +3873,10 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, { struct vm_area_struct *vma; void *old_buf = buf; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -3882,7 +3886,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - write, 1, &page, &vma); + flags, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index ade7c6cad172..682b73af7766 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - 0, 1, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From 6347e8d5bcce33fc36e651901efefbe2c93a43ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:19 +0100 Subject: mm: replace access_remote_vm() write parameter with gup_flags This removes the 'write' argument from access_remote_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 +++++++++++++------ include/linux/mm.h | 2 +- mm/memory.c | 11 +++-------- mm/nommu.c | 7 +++---- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index c2964d890c9a..8e654468ab67 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -252,7 +252,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. 
*/ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_FORCE); if (rv <= 0) goto out_free_page; @@ -270,7 +270,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -305,7 +306,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -354,7 +356,8 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -832,6 +835,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags = FOLL_FORCE; if (!mm) return 0; @@ -844,6 +848,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -852,7 +859,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -965,7 +972,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + page, this_len, FOLL_FORCE); if (retval <= 0) { ret = retval; diff --git a/include/linux/mm.h b/include/linux/mm.h index ecc4be7b67e0..f31bf9058587 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1268,7 +1268,7 @@ static inline int fixup_user_fault(struct task_struct *tsk, extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write); + void *buf, int len, unsigned int gup_flags); long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/mm/memory.c b/mm/memory.c index 79ebed3a4c2b..bac2d994850e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3935,19 +3935,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. 
*/ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - unsigned int flags = FOLL_FORCE; - - if (write) - flags |= FOLL_WRITE; - - return __access_remote_vm(NULL, mm, addr, buf, len, flags); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* diff --git a/mm/nommu.c b/mm/nommu.c index bde7df35118b..93d5bb53fc63 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1847,15 +1847,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* -- cgit v1.2.3 From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 9 ++++++--- arch/blackfin/kernel/ptrace.c | 5 +++-- arch/cris/arch-v32/kernel/ptrace.c | 4 ++-- arch/ia64/kernel/ptrace.c | 14 +++++++++----- arch/m32r/kernel/ptrace.c | 15 ++++++++++----- arch/mips/kernel/ptrace32.c | 5 +++-- arch/powerpc/kernel/ptrace32.c | 5 +++-- arch/score/kernel/ptrace.c | 10 ++++++---- arch/sparc/kernel/ptrace_64.c | 24 ++++++++++++++++-------- arch/x86/kernel/step.c | 3 ++- arch/x86/um/ptrace_32.c | 3 ++- arch/x86/um/ptrace_64.c | 3 ++- include/linux/mm.h | 3 ++- kernel/ptrace.c | 16 ++++++++++------ mm/memory.c | 8 ++------ mm/nommu.c | 6 +++--- mm/util.c | 5 +++-- 17 files changed, 84 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index d9ee81769899..940dfb406591 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -157,14 +157,16 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) static inline int read_int(struct task_struct *task, unsigned long addr, int * data) { - int copied = access_process_vm(task, addr, data, sizeof(int), 0); + int copied = access_process_vm(task, addr, data, sizeof(int), + FOLL_FORCE); return (copied == sizeof(int)) ? 0 : -EIO; } static inline int write_int(struct task_struct *task, unsigned long addr, int data) { - int copied = access_process_vm(task, addr, &data, sizeof(int), 1); + int copied = access_process_vm(task, addr, &data, sizeof(int), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(int)) ? 0 : -EIO; } @@ -281,7 +283,8 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. 
*/ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) break; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8b8fe671b1a6..8d79286ee4e8 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -271,7 +271,7 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &tmp, - to_copy, 0); + to_copy, FOLL_FORCE); if (copied) break; @@ -324,7 +324,8 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &data, - to_copy, 1); + to_copy, + FOLL_FORCE | FOLL_WRITE); break; case BFIN_MEM_ACCESS_DMA: if (safe_dma_memcpy(paddr, &data, to_copy)) diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f085229cf870..f0df654ac6fc 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -279,7 +279,7 @@ static int insn_size(struct task_struct *child, unsigned long pc) int opsize = 0; /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */ - copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0); + copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE); if (copied != sizeof(opcode)) return 0; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6f54d511cc50..31aa8c0f68e1 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -453,7 +453,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, return 0; } } - copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); + copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE); if (copied != sizeof(ret)) return -EIO; *val = ret; @@ -489,7 +489,8 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack, *ia64_rse_skip_regs(krbs, regnum) = val; } } - } else if (access_process_vm(child, addr, &val, sizeof(val), 1) + } else if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; return 0; @@ -543,7 +544,8 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, ret = ia64_peek(child, sw, user_rbs_end, addr, &val); if (ret < 0) return ret; - if (access_process_vm(child, addr, &val, sizeof(val), 1) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; } @@ -559,7 +561,8 @@ ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw, /* now copy word for word from user rbs to kernel rbs: */ for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { - if (access_process_vm(child, addr, &val, sizeof(val), 0) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE) != sizeof(val)) return -EIO; @@ -1156,7 +1159,8 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if 
(access_process_vm(child, addr, &data, sizeof(data), 0) + if (access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE) != sizeof(data)) return -EIO; /* ensure return value is not mistaken for error code */ diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 51f5e9aa4901..c145605a981f 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -493,7 +493,8 @@ unregister_all_debug_traps(struct task_struct *child) int i; for (i = 0; i < p->nr_trap; i++) - access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1); + access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), + FOLL_FORCE | FOLL_WRITE); p->nr_trap = 0; } @@ -537,7 +538,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) unsigned long next_insn, code; unsigned long addr = next_pc & ~3; - if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0) + if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), + FOLL_FORCE) != sizeof(next_insn)) { return -1; /* error */ } @@ -546,7 +548,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) if (register_debug_trap(child, next_pc, next_insn, &code)) { return -1; /* error */ } - if (access_process_vm(child, addr, &code, sizeof(code), 1) + if (access_process_vm(child, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE) != sizeof(code)) { return -1; /* error */ } @@ -562,7 +565,8 @@ withdraw_debug_trap(struct pt_regs *regs) addr = (regs->bpc - 2) & ~3; regs->bpc -= 2; if (unregister_debug_trap(current, addr, &code)) { - access_process_vm(current, addr, &code, sizeof(code), 1); + access_process_vm(current, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE); invalidate_cache(); } } @@ -589,7 +593,8 @@ void user_enable_single_step(struct task_struct *child) /* Compute next pc. 
*/ pc = get_stack_long(child, PT_BPC); - if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0) + if (access_process_vm(child, pc&~3, &insn, sizeof(insn), + FOLL_FORCE) != sizeof(insn)) return; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 283b5a1967d1..7e71a4e0281b 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -70,7 +70,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *) (unsigned long) data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &data, - sizeof(data), 1) == sizeof(data)) + sizeof(data), + FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; ret = -EIO; break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index f52b7db327c8..010b7b310237 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -74,7 +74,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *)data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 1) == sizeof(tmp)) + sizeof(tmp), + FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; ret = -EIO; break; diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 55836188b217..4f7314d5f334 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -131,7 +131,7 @@ read_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -142,7 +142,7 @@ read_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -153,7 +153,8 @@ write_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } @@ -164,7 +165,8 @@ write_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? 
-EIO : 0; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..ac082dd8c67d 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -127,7 +127,8 @@ static int get_from_target(struct task_struct *target, unsigned long uaddr, if (copy_from_user(kbuf, (void __user *) uaddr, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 0); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE); if (len2 != len) return -EFAULT; } @@ -141,7 +142,8 @@ static int set_to_target(struct task_struct *target, unsigned long uaddr, if (copy_to_user((void __user *) uaddr, kbuf, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 1); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE | FOLL_WRITE); if (len2 != len) return -EFAULT; } @@ -505,7 +507,8 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - k, sizeof(*k), 0) + k, sizeof(*k), + FOLL_FORCE) != sizeof(*k)) return -EFAULT; k++; @@ -531,12 +534,14 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; @@ -615,7 +620,8 @@ static int genregs32_set(struct task_struct *target, (unsigned long) ®_window[pos], (void *) k, - sizeof(*k), 1) + sizeof(*k), + FOLL_FORCE | FOLL_WRITE) != sizeof(*k)) return -EFAULT; k++; @@ -642,13 +648,15 @@ static int genregs32_set(struct task_struct *target, if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c9a073866ca7..a23ce84a3f6c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 5766ead6fdb9..60a5a5a85505 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -36,7 +36,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. 
*/ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk(KERN_ERR "is_syscall : failed to read " "instruction from 0x%lx\n", addr); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 0b5c184dd5b3..e30202b1716e 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -212,7 +212,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk("is_syscall : failed to read instruction from " "0x%lx\n", addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index f31bf9058587..ffbd72979ee7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1266,7 +1266,8 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 
0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index bac2d994850e..e18c57bdc75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3951,20 +3951,16 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; - unsigned int flags = FOLL_FORCE; mm = get_task_mm(tsk); if (!mm) return 0; - if (write) - flags |= FOLL_WRITE; - - ret = __access_remote_vm(tsk, mm, addr, buf, len, flags); + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 93d5bb53fc63..db5fd1795298 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1861,7 +1861,8 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Access another process' address space. * - source/target buffer must be kernel space */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) { struct mm_struct *mm; @@ -1872,8 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/util.c b/mm/util.c index 4c685bde5ebc..952cbe7ad7b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -624,7 +624,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen) len = buflen; - res = access_process_vm(task, arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then @@ -639,7 +639,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, - buffer+res, len, 0); + buffer+res, len, + FOLL_FORCE); res = strnlen(buffer, res); } } -- cgit v1.2.3 From 8ef2074d28373014d05e92b5f13364ef51075b6e Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 19 Oct 2016 09:51:05 -0600 Subject: nvme: Add tertiary number to NVME_VS NVMe 1.2.1 specification adds a tertiary element to the version number. This updates the macro and its callers to include the final number and fixup a single place in nvmet where the version was generated manually. 
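As a quick aside for readers of this log, the new packing is easy to check in isolation. The following is a stand-alone user-space sketch, not kernel code; it only assumes the three-argument NVME_VS() definition shown in the include/linux/nvme.h hunk below (major in bits 31:16, minor in 15:8, tertiary in 7:0).

#include <stdio.h>
#include <stdint.h>

/* Mirrors the three-argument NVME_VS() added to include/linux/nvme.h by
 * this patch: major in bits 31:16, minor in 15:8, tertiary in 7:0. */
#define NVME_VS(major, minor, tertiary) \
        (((major) << 16) | ((minor) << 8) | (tertiary))

int main(void)
{
        uint32_t v121 = NVME_VS(1, 2, 1);       /* NVMe 1.2.1 */
        uint32_t v110 = NVME_VS(1, 1, 0);       /* NVMe 1.1 */

        /* 0x10201 - the same value nvmet_subsys_alloc() used to build by
         * hand as (1 << 16) | (2 << 8) | 1. */
        printf("1.2.1 packs to 0x%05x\n", v121);

        /* Existing gates such as "ctrl->vs >= NVME_VS(1, 1, 0)" behave as
         * before, since the tertiary number only fills the low byte. */
        printf("1.2.1 >= 1.1.0: %s\n", v121 >= v110 ? "yes" : "no");
        return 0;
}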
Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/scsi.c | 4 ++-- drivers/nvme/target/core.c | 2 +- include/linux/nvme.h | 3 ++- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2a57f5ede386..bb168b71048d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -900,9 +900,9 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) return -ENODEV; } - if (ns->ctrl->vs >= NVME_VS(1, 1)) + if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); - if (ns->ctrl->vs >= NVME_VS(1, 2)) + if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); return 0; @@ -1242,7 +1242,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } page_shift = NVME_CAP_MPSMIN(cap) + 12; - if (ctrl->vs >= NVME_VS(1, 1)) + if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(cap); ret = nvme_identify_ctrl(ctrl, &id); @@ -1842,7 +1842,7 @@ static void nvme_scan_work(struct work_struct *work) return; nn = le32_to_cpu(id->nn); - if (ctrl->vs >= NVME_VS(1, 1) && + if (ctrl->vs >= NVME_VS(1, 1, 0) && !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { if (!nvme_scan_ns_list(ctrl, nn)) goto done; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a7c6e9d74943..26a8d31b291d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1214,7 +1214,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? NVME_CAP_NSSRC(cap) : 0; if (dev->subsystem && @@ -1633,7 +1633,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) * NULL as final argument to sysfs_add_file_to_group. 
*/ - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { dev->cmb = nvme_map_cmb(dev); if (dev->cmbsz) { diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c2a0a1c7d05d..3eaa4d27801e 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -606,7 +606,7 @@ static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, eui = id_ns->eui64; len = sizeof(id_ns->eui64); - if (ns->ctrl->vs >= NVME_VS(1, 2)) { + if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); @@ -679,7 +679,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; - if (ns->ctrl->vs >= NVME_VS(1, 1)) { + if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) { res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); if (res != -EOPNOTSUPP) return res; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6559d5afa7bf..b4cacb6f0258 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -882,7 +882,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return NULL; - subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + subsys->ver = NVME_VS(1, 2, 1); /* NVMe 1.2.1 */ switch (type) { case NVME_NQN_NVME: diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7676557ce357..086d196e68f7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -960,6 +960,7 @@ struct nvme_completion { __le16 status; /* did the command fail, and if so, why? */ }; -#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From a446c0840e244f34c22cc13b3a62d50aa51fb4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:06 +0200 Subject: nvme.h: resync with nvme-cli Import a few updates to nvme.h from nvme-cli. This mostly includes a few new fields and error codes, but also a few renames that so far are only used in user space. Also one field is moved from an array of two le64 values to one of 16 u8 values so that we can more easily access it. 
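To see why the flat byte-array layout is handier than an array of two le64 values, here is a stand-alone user-space sketch (not kernel code) that extracts the low 64 bits of a 16-byte capacity field such as the reworked nvmcap. It assumes the field is little-endian, as NVMe multi-byte fields generally are, and that the capacity fits in 64 bits; the 1 TB figure is an invented example.

#include <stdio.h>
#include <stdint.h>

/* Illustration only, not kernel code: fold the low eight bytes of a
 * 16-byte little-endian capacity field (the reworked nvmcap/tnvmcap
 * layout) into a u64. Assumes the value fits in 64 bits, which holds
 * for any realistic capacity today. */
static uint64_t le128_low64(const uint8_t b[16])
{
        uint64_t v = 0;
        int i;

        for (i = 7; i >= 0; i--)
                v = (v << 8) | b[i];
        return v;
}

int main(void)
{
        uint8_t nvmcap[16] = { 0 };
        uint64_t bytes = 1000204886016ULL;      /* invented ~1 TB namespace */
        int i;

        for (i = 0; i < 8; i++)
                nvmcap[i] = (bytes >> (8 * i)) & 0xff;

        printf("nvmcap = %llu bytes\n", (unsigned long long)le128_low64(nvmcap));
        return 0;
}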
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/discovery.c | 2 +- include/linux/nvme.h | 33 +++++++++++++++++++++++++++------ 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 7ab9c9381b98..6944f246e562 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -199,7 +199,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) */ /* we support multiple ports and multiples hosts: */ - id->mic = (1 << 0) | (1 << 1); + id->cmic = (1 << 0) | (1 << 1); /* no limit on data transfer sizes for now */ id->mdts = 0; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 6f65646e89cf..a6c25fbcff3f 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -54,7 +54,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, /* we support only dynamic controllers */ e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); - e->nqntype = type; + e->subtype = type; memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 086d196e68f7..989699641e10 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -182,7 +182,7 @@ struct nvme_id_ctrl { char fr[8]; __u8 rab; __u8 ieee[3]; - __u8 mic; + __u8 cmic; __u8 mdts; __le16 cntlid; __le32 ver; @@ -202,7 +202,13 @@ struct nvme_id_ctrl { __u8 apsta; __le16 wctemp; __le16 cctemp; - __u8 rsvd270[50]; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __u8 rsvd316[4]; __le16 kas; __u8 rsvd322[190]; __u8 sqes; @@ -267,7 +273,7 @@ struct nvme_id_ns { __le16 nabo; __le16 nabspf; __u16 rsvd46; - __le64 nvmcap[2]; + __u8 nvmcap[16]; __u8 rsvd64[40]; __u8 nguid[16]; __u8 eui64[8]; @@ -556,8 +562,10 @@ enum nvme_admin_opcode { nvme_admin_set_features = 0x09, nvme_admin_get_features = 0x0a, nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -583,6 +591,7 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, @@ -745,7 +754,7 @@ struct nvmf_common_command { struct nvmf_disc_rsp_page_entry { __u8 trtype; __u8 adrfam; - __u8 nqntype; + __u8 subtype; __u8 treq; __le16 portid; __le16 cntlid; @@ -905,12 +914,23 @@ enum { NVME_SC_INVALID_VECTOR = 0x108, NVME_SC_INVALID_LOG_PAGE = 0x109, NVME_SC_INVALID_FORMAT = 0x10a, - NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, NVME_SC_INVALID_QUEUE = 0x10c, NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, NVME_SC_FEATURE_NOT_PER_NS = 0x10f, - NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + 
NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, /* * I/O Command Set Specific - NVM commands: @@ -941,6 +961,7 @@ enum { NVME_SC_REFTAG_CHECK = 0x284, NVME_SC_COMPARE_FAILED = 0x285, NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, }; -- cgit v1.2.3 From 8d63687afda019a6e037bf9c4ceb3e514c26a35d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:07 +0200 Subject: nvme.h: don't use uuid_be This makes life easier for nvme-cli and we don't really need the uuid type anyway to start with. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Jay Freyensee Signed-off-by: Jens Axboe --- include/linux/nvme.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 989699641e10..d31ff2dd1d51 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -16,7 +16,6 @@ #define _LINUX_NVME_H #include -#include /* NQN names in commands fields specified one size */ #define NVMF_NQN_FIELD_LEN 256 @@ -803,7 +802,7 @@ struct nvmf_connect_command { }; struct nvmf_connect_data { - uuid_be hostid; + __u8 hostid[16]; __le16 cntlid; char resv4[238]; char subsysnqn[NVMF_NQN_FIELD_LEN]; -- cgit v1.2.3 From 329dd7681c5af84e8ea9f4494c1a304389cdfc6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:08 +0200 Subject: nvme.h: add an enum for cns values Ported over from nvme-cli. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d31ff2dd1d51..fc3c24206593 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -281,6 +281,16 @@ struct nvme_id_ns { __u8 vs[3712]; }; +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, -- cgit v1.2.3 From 449a137846c84829a328757cd21fd9ca65c08519 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 8 Oct 2016 17:26:44 -0700 Subject: target: Make EXTENDED_COPY 0xe4 failure return COPY TARGET DEVICE NOT REACHABLE This patch addresses a bug where EXTENDED_COPY across multiple LUNs results in a CHECK_CONDITION when the source + destination are not located on the same physical node. ESX Host environments expect sense COPY_ABORTED w/ COPY TARGET DEVICE NOT REACHABLE to be returned when this occurs, in order to signal fallback to local copy method. As described in section 6.3.3 of spc4r22: "If it is not possible to complete processing of a segment because the copy manager is unable to establish communications with a copy target device, because the copy target device does not respond to INQUIRY, or because the data returned in response to INQUIRY indicates an unsupported logical unit, then the EXTENDED COPY command shall be terminated with CHECK CONDITION status, with the sense key set to COPY ABORTED, and the additional sense code set to COPY TARGET DEVICE NOT REACHABLE." 
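In sense-data terms, the quoted requirement means the target must return sense key COPY ABORTED with ASC/ASCQ 0x0d/0x02, which is what the sense_info_table entry added below describes. The following stand-alone sketch only illustrates that encoding; the 0x70 response code, the 0x0a key value and the fixed 18-byte layout are taken from SPC rather than from this patch.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Stand-alone sketch of the fixed-format sense data an initiator would
 * see for the new TCM_COPY_TARGET_DEVICE_NOT_REACHABLE case. The 0x70
 * response code, 0x0a COPY ABORTED key and 18-byte layout come from SPC,
 * not from this patch; the ASC/ASCQ pair matches the table entry added
 * below. */
int main(void)
{
        uint8_t sense[18];

        memset(sense, 0, sizeof(sense));
        sense[0] = 0x70;        /* current error, fixed format */
        sense[2] = 0x0a;        /* sense key: COPY ABORTED */
        sense[7] = 0x0a;        /* additional sense length */
        sense[12] = 0x0d;       /* ASC */
        sense[13] = 0x02;       /* ASCQ: COPY TARGET DEVICE NOT REACHABLE */

        printf("key=%#x asc=%#x ascq=%#x\n", sense[2], sense[12], sense[13]);
        return 0;
}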
Tested on v4.1.y with ESX v5.5u2+ with BlockCopy across multiple nodes. Reported-by: Nixon Vincent Tested-by: Nixon Vincent Cc: Nixon Vincent Tested-by: Dinesh Israni Signed-off-by: Dinesh Israni Cc: Dinesh Israni Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 7 +++++++ drivers/target/target_core_xcopy.c | 22 ++++++++++++++++------ include/target/target_core_base.h | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 00ec46413146..000bc6d077ea 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1706,6 +1706,7 @@ void transport_generic_request_failure(struct se_cmd *cmd, case TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED: case TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED: case TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED: + case TCM_COPY_TARGET_DEVICE_NOT_REACHABLE: break; case TCM_OUT_OF_RESOURCES: sense_reason = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -2873,6 +2874,12 @@ static const struct sense_info sense_info_table[] = { .ascq = 0x03, /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */ .add_sector_info = true, }, + [TCM_COPY_TARGET_DEVICE_NOT_REACHABLE] = { + .key = COPY_ABORTED, + .asc = 0x0d, + .ascq = 0x02, /* COPY TARGET DEVICE NOT REACHABLE */ + + }, [TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE] = { /* * Returning ILLEGAL REQUEST would cause immediate IO errors on diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 75cd85426ae3..f2a1443ecb33 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -104,7 +104,7 @@ static int target_xcopy_locate_se_dev_e4(struct se_cmd *se_cmd, struct xcopy_op } mutex_unlock(&g_device_mutex); - pr_err("Unable to locate 0xe4 descriptor for EXTENDED_COPY\n"); + pr_debug_ratelimited("Unable to locate 0xe4 descriptor for EXTENDED_COPY\n"); return -EINVAL; } @@ -185,7 +185,7 @@ static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, struct xcopy_op *xop, unsigned char *p, - unsigned short tdll) + unsigned short tdll, sense_reason_t *sense_ret) { struct se_device *local_dev = se_cmd->se_dev; unsigned char *desc = p; @@ -193,6 +193,8 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, unsigned short start = 0; bool src = true; + *sense_ret = TCM_INVALID_PARAMETER_LIST; + if (offset != 0) { pr_err("XCOPY target descriptor list length is not" " multiple of %d\n", XCOPY_TARGET_DESC_LEN); @@ -243,9 +245,16 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, rc = target_xcopy_locate_se_dev_e4(se_cmd, xop, true); else rc = target_xcopy_locate_se_dev_e4(se_cmd, xop, false); - - if (rc < 0) + /* + * If a matching IEEE NAA 0x83 descriptor for the requested device + * is not located on this node, return COPY_ABORTED with ASQ/ASQC + * 0x0d/0x02 - COPY_TARGET_DEVICE_NOT_REACHABLE to request the + * initiator to fall back to normal copy method. 
+ */ + if (rc < 0) { + *sense_ret = TCM_COPY_TARGET_DEVICE_NOT_REACHABLE; goto out; + } pr_debug("XCOPY TGT desc: Source dev: %p NAA IEEE WWN: 0x%16phN\n", xop->src_dev, &xop->src_tid_wwn[0]); @@ -816,7 +825,8 @@ out: xcopy_pt_undepend_remotedev(xop); kfree(xop); - pr_warn("target_xcopy_do_work: Setting X-COPY CHECK_CONDITION -> sending response\n"); + pr_warn_ratelimited("target_xcopy_do_work: rc: %d, Setting X-COPY CHECK_CONDITION" + " -> sending response\n", rc); ec_cmd->scsi_status = SAM_STAT_CHECK_CONDITION; target_complete_cmd(ec_cmd, SAM_STAT_CHECK_CONDITION); } @@ -875,7 +885,7 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) " tdll: %hu sdll: %u inline_dl: %u\n", list_id, list_id_usage, tdll, sdll, inline_dl); - rc = target_xcopy_parse_target_descriptors(se_cmd, xop, &p[16], tdll); + rc = target_xcopy_parse_target_descriptors(se_cmd, xop, &p[16], tdll, &ret); if (rc <= 0) goto out; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index fb8e3b6febdf..c2119008990a 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -177,6 +177,7 @@ enum tcm_sense_reason_table { TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED = R(0x15), TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED = R(0x16), TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED = R(0x17), + TCM_COPY_TARGET_DEVICE_NOT_REACHABLE = R(0x18), #undef R }; -- cgit v1.2.3 From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:51:28 +1100 Subject: iomap: add IOMAP_REPORT This allows the file system to tell a FIEMAP from a read operation, and thus avoids the need to report flags that aren't actually used in the read path. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/iomap.c | 2 +- include/linux/iomap.h | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/iomap.c b/fs/iomap.c index 013d1d36fbbf..a92204012e2d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -561,7 +561,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e63e288dee83..7892f55a1866 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -19,11 +19,15 @@ struct vm_fault; #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ /* - * Flags for iomap mappings: + * Flags for all iomap mappings: */ -#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x02 /* block shared with another file */ -#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */ +#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ + +/* + * Flags that only need to be reported for IOMAP_REPORT requests: + */ +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x20 /* block shared with another file */ /* * Magic value for blkno: @@ -42,8 +46,9 @@ struct iomap { /* * Flags for iomap_begin / iomap_end. No flag implies a read. 
*/ -#define IOMAP_WRITE (1 << 0) -#define IOMAP_ZERO (1 << 1) +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ struct iomap_ops { /* -- cgit v1.2.3 From d17af5056cf9e9fc05e68832f7c15687fcc12281 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Sep 2016 10:58:58 -0700 Subject: mm: Change vm_is_stack_for_task() to vm_is_stack_for_current() Asking for a non-current task's stack can't be done without races unless the task is frozen in kernel mode. As far as I know, vm_is_stack_for_task() never had a safe non-current use case. The __unused annotation is because some KSTK_ESP implementations ignore their parameter, which IMO is further justification for this patch. Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Al Viro Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Jann Horn Cc: Kees Cook Cc: Linus Torvalds Cc: Linux API Cc: Peter Zijlstra Cc: Tycho Andersen Link: http://lkml.kernel.org/r/4c3f68f426e6c061ca98b4fc7ef85ffbb0a25b0c.1475257877.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm.h | 2 +- mm/util.c | 4 +++- security/selinux/hooks.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e9caec6a51e9..a658a5167bce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1391,7 +1391,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); +int vma_is_stack_for_current(struct vm_area_struct *vma); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, diff --git a/mm/util.c b/mm/util.c index 662cddf914af..c174e8921995 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,8 +230,10 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) +int vma_is_stack_for_current(struct vm_area_struct *vma) { + struct task_struct * __maybe_unused t = current; + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 085057936287..09fd6108e421 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3557,7 +3557,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || - vma_is_stack_for_task(vma, current))) { + vma_is_stack_for_current(vma))) { rc = current_has_perm(current, PROCESS__EXECSTACK); } else if (vma->vm_file && vma->anon_vma) { /* -- cgit v1.2.3 From c8061485a0d7569a865a3cc3c63347b0f42b3765 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 19 Oct 2016 19:28:11 +0100 Subject: sched/core, x86: Make struct thread_info arch specific again The following commit: c65eacbe290b ("sched/core: Allow putting thread_info into task_struct") ... made 'struct thread_info' a generic struct with only a single ::flags member, if CONFIG_THREAD_INFO_IN_TASK_STRUCT=y is selected. 
This change however seems to be quite x86 centric, since at least the generic preemption code (asm-generic/preempt.h) assumes that struct thread_info also has a preempt_count member, which apparently was not true for x86. We could add a bit more #ifdefs to solve this problem too, but it seems to be much simpler to make struct thread_info arch specific again. This also makes the conversion to THREAD_INFO_IN_TASK_STRUCT a bit easier for architectures that have a couple of arch specific stuff in their thread_info definition. The arch specific stuff _could_ be moved to thread_struct. However keeping them in thread_info makes it easier: accessing thread_info members is simple, since it is at the beginning of the task_struct, while the thread_struct is at the end. At least on s390 the offsets needed to access members of the thread_struct (with task_struct as base) are too large for various asm instructions. This is not a problem when keeping these members within thread_info. Signed-off-by: Heiko Carstens Signed-off-by: Mark Rutland Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keescook@chromium.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/1476901693-8492-2-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 9 +++++++++ include/linux/thread_info.h | 11 ----------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2aaca53c0974..ad6f5eb07a95 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,6 +52,15 @@ struct task_struct; #include #include +struct thread_info { + unsigned long flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} + #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 45f004e9cc59..2873baf5372a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,17 +13,6 @@ struct timespec; struct compat_timespec; -#ifdef CONFIG_THREAD_INFO_IN_TASK -struct thread_info { - unsigned long flags; /* low level flags */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .flags = 0, \ -} -#endif - #ifdef CONFIG_THREAD_INFO_IN_TASK #define current_thread_info() ((struct thread_info *)current) #endif -- cgit v1.2.3 From c6fe46a79ecd79606bb96fada4515f6b23f87b62 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 18 Oct 2016 00:41:12 +0900 Subject: cpufreq: fix overflow in cpufreq_table_find_index_dl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'best' is always less or equals to 'pos', so `best - pos' returns a negative value which is then getting casted to `unsigned int' and passed to __cpufreq_driver_target()->acpi_cpufreq_target() for policy->freq_table selection. 
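A toy user-space example (an invented table, not the cpufreq code itself) makes the arithmetic concrete:

#include <stdio.h>

/* Toy illustration, not the cpufreq code itself: with 'best' at index 2
 * and 'pos' at index 5, "best - pos" is -3; converted to unsigned int it
 * becomes a huge bogus index, while "best - table" gives the intended
 * offset of the best entry. */
struct freq_entry { unsigned int frequency; };

int main(void)
{
        struct freq_entry table[8];
        struct freq_entry *best = &table[2];
        struct freq_entry *pos = &table[5];

        printf("best - pos   = %td -> as unsigned int: %u\n",
               best - pos, (unsigned int)(best - pos));
        printf("best - table = %td\n", best - table);
        return 0;
}

The caller then selects an entry from policy->freq_table with that out-of-range index.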
This results in BUG: unable to handle kernel paging request at ffff881019b469f8 IP: [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] PGD 267f067 PUD 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 70 Comm: kworker/6:1 Not tainted 4.9.0-rc1-next-20161017-dbg-dirty Workqueue: events dbs_work_handler task: ffff88041b808000 task.stack: ffff88041b810000 RIP: 0010:[] [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] RSP: 0018:ffff88041b813c60 EFLAGS: 00010282 RAX: ffff880419b46a00 RBX: ffff88041b848400 RCX: ffff880419b20f80 RDX: 00000000001dff38 RSI: 00000000ffffffff RDI: ffff88041b848400 RBP: ffff88041b813cb0 R08: 0000000000000006 R09: 0000000000000040 R10: ffffffff8207f9e0 R11: ffffffff8173595b R12: 0000000000000000 R13: ffff88041f1dff38 R14: 0000000000262900 R15: 0000000bfffffff4 FS: 0000000000000000(0000) GS:ffff88041f000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff881019b469f8 CR3: 000000041a2d3000 CR4: 00000000001406e0 Stack: ffff88041b813cb0 ffffffff813347f9 ffff88041b813ca0 ffffffff81334663 ffff88041f1d4bc0 ffff88041b848400 0000000000000000 0000000000000000 0000000000262900 0000000000000000 ffff88041b813d00 ffffffff813355dc Call Trace: [] ? cpufreq_freq_transition_begin+0xf1/0xfc [] ? get_cpu_idle_time+0x97/0xa6 [] __cpufreq_driver_target+0x3b6/0x44e [] cs_dbs_timer+0x11a/0x135 [] dbs_work_handler+0x39/0x62 [] process_one_work+0x280/0x4a5 [] worker_thread+0x24f/0x397 [] ? rescuer_thread+0x30b/0x30b [] ? nl80211_get_key+0x29/0x36a [] kthread+0xfc/0x104 [] ? put_lock_stats.isra.9+0xe/0x20 [] ? kthread_create_on_node+0x3f/0x3f [] ret_from_fork+0x22/0x30 Code: 56 4d 6b ff 0c 41 55 41 54 53 48 83 ec 28 48 8b 15 ad 1e 00 00 44 8b 41 08 48 8b 87 c8 00 00 00 49 89 d5 4e 03 2c c5 80 b2 78 81 <46> 8b 74 38 04 45 3b 75 00 75 11 31 c0 83 39 00 0f 84 1c 01 00 RIP [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] RSP CR2: ffff881019b469f8 ---[ end trace 16d9fc7a17897d37 ]--- [ rjw: In some cases this bug may also cause incorrect frequencies to be selected by cpufreq governors. ] Fixes: 899bb6642f2a (cpufreq: skip invalid entries when searching the frequency) Link: http://marc.info/?l=linux-kernel&m=147672030714331&w=2 Reported-and-tested-by: Sedat Dilek Reported-and-tested-by: Jörg Otte Signed-off-by: Sergey Senozhatsky Acked-by: Viresh Kumar Cc: 4.8+ # 4.8+ Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5fa55fc56e18..32dc0cbd51ca 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -677,10 +677,10 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, if (best == table - 1) return pos - table; - return best - pos; + return best - table; } - return best - pos; + return best - table; } /* Works only on sorted freq-tables */ -- cgit v1.2.3 From 9995f4f184613fb02ee73092b03545520a72b104 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Thu, 13 Oct 2016 21:51:06 +0000 Subject: clocksource: Add J-Core timer/clocksource driver At the hardware level, the J-Core PIT is integrated with the interrupt controller, but it is represented as its own device and has an independent programming interface. It provides a 12-bit countdown timer, which is not presently used, and a periodic timer. The interval length for the latter is programmable via a 32-bit throttle register whose units are determined by a bus-period register. 
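The arithmetic the driver derives from that bus-period register is simple enough to sketch in user space. This mirrors the jcore_pit_local_init() computation further down in this patch; the 25 ns bus period and the HZ value are invented example inputs, and DIV_ROUND_CLOSEST is a simplified unsigned-only stand-in for the kernel macro.

#include <stdio.h>

/* User-space sketch of the jcore_pit_local_init() arithmetic further down
 * in this patch. The 25 ns bus period (a 40 MHz bus) and HZ of 100 are
 * invented example inputs, and DIV_ROUND_CLOSEST is a simplified
 * unsigned-only stand-in for the kernel macro. */
#define NSEC_PER_SEC 1000000000UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long buspd = 25;       /* REG_BUSPD: bus period in ns */
        unsigned long hz = 100;         /* tick rate (CONFIG_HZ) */

        unsigned long freq = DIV_ROUND_CLOSEST(NSEC_PER_SEC, buspd);
        unsigned long delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, hz * buspd);

        printf("clockevent frequency = %lu Hz\n", freq);
        printf("throttle units per periodic tick = %lu\n", delta);
        return 0;
}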
The periodic timer is used to implement both periodic and oneshot clock event modes; in oneshot mode the interrupt handler simply disables the timer as soon as it fires. Despite its device tree node representing an interrupt for the PIT, the actual irq generated is programmable, not hard-wired. The driver is responsible for programming the PIT to generate the hardware irq number that the DT assigns to it. On SMP configurations, J-Core provides cpu-local instances of the PIT; no broadcast timer is needed. This driver supports the creation of the necessary per-cpu clock_event_device instances. A nanosecond-resolution clocksource is provided using the J-Core "RTC" registers, which give a 64-bit seconds count and 32-bit nanoseconds that wrap every second. The driver converts these to a full-range 32-bit nanoseconds count. Signed-off-by: Rich Felker Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: Daniel Lezcano Cc: Rob Herring Link: http://lkml.kernel.org/r/b591ff12cc5ebf63d1edc98da26046f95a233814.1476393790.git.dalias@libc.org Signed-off-by: Thomas Gleixner --- drivers/clocksource/Kconfig | 10 ++ drivers/clocksource/Makefile | 1 + drivers/clocksource/jcore-pit.c | 249 ++++++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 4 files changed, 261 insertions(+) create mode 100644 drivers/clocksource/jcore-pit.c (limited to 'include') diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 245190839359..e2c6e43cf8ca 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -417,6 +417,16 @@ config SYS_SUPPORTS_SH_TMU config SYS_SUPPORTS_EM_STI bool +config CLKSRC_JCORE_PIT + bool "J-Core PIT timer driver" if COMPILE_TEST + depends on OF + depends on GENERIC_CLOCKEVENTS + depends on HAS_IOMEM + select CLKSRC_MMIO + help + This enables build of clocksource and clockevent driver for + the integrated PIT in the J-Core synthesizable, open source SoC. + config SH_TIMER_CMT bool "Renesas CMT timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index fd9d6df0bbc0..cf87f407f1ad 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC) += cs5535-clockevt.o +obj-$(CONFIG_CLKSRC_JCORE_PIT) += jcore-pit.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c new file mode 100644 index 000000000000..54e1665aa03c --- /dev/null +++ b/drivers/clocksource/jcore-pit.c @@ -0,0 +1,249 @@ +/* + * J-Core SoC PIT/clocksource driver + * + * Copyright (C) 2015-2016 Smart Energy Instruments, Inc. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PIT_IRQ_SHIFT 12 +#define PIT_PRIO_SHIFT 20 +#define PIT_ENABLE_SHIFT 26 +#define PIT_PRIO_MASK 0xf + +#define REG_PITEN 0x00 +#define REG_THROT 0x10 +#define REG_COUNT 0x14 +#define REG_BUSPD 0x18 +#define REG_SECHI 0x20 +#define REG_SECLO 0x24 +#define REG_NSEC 0x28 + +struct jcore_pit { + struct clock_event_device ced; + void __iomem *base; + unsigned long periodic_delta; + u32 enable_val; +}; + +static void __iomem *jcore_pit_base; +static struct jcore_pit __percpu *jcore_pit_percpu; + +static notrace u64 jcore_sched_clock_read(void) +{ + u32 seclo, nsec, seclo0; + __iomem void *base = jcore_pit_base; + + seclo = readl(base + REG_SECLO); + do { + seclo0 = seclo; + nsec = readl(base + REG_NSEC); + seclo = readl(base + REG_SECLO); + } while (seclo0 != seclo); + + return seclo * NSEC_PER_SEC + nsec; +} + +static cycle_t jcore_clocksource_read(struct clocksource *cs) +{ + return jcore_sched_clock_read(); +} + +static int jcore_pit_disable(struct jcore_pit *pit) +{ + writel(0, pit->base + REG_PITEN); + return 0; +} + +static int jcore_pit_set(unsigned long delta, struct jcore_pit *pit) +{ + jcore_pit_disable(pit); + writel(delta, pit->base + REG_THROT); + writel(pit->enable_val, pit->base + REG_PITEN); + return 0; +} + +static int jcore_pit_set_state_shutdown(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_disable(pit); +} + +static int jcore_pit_set_state_oneshot(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_disable(pit); +} + +static int jcore_pit_set_state_periodic(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_set(pit->periodic_delta, pit); +} + +static int jcore_pit_set_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_set(delta, pit); +} + +static int jcore_pit_local_init(unsigned cpu) +{ + struct jcore_pit *pit = this_cpu_ptr(jcore_pit_percpu); + unsigned buspd, freq; + + pr_info("Local J-Core PIT init on cpu %u\n", cpu); + + buspd = readl(pit->base + REG_BUSPD); + freq = DIV_ROUND_CLOSEST(NSEC_PER_SEC, buspd); + pit->periodic_delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, HZ * buspd); + + clockevents_config_and_register(&pit->ced, freq, 1, ULONG_MAX); + + return 0; +} + +static irqreturn_t jcore_timer_interrupt(int irq, void *dev_id) +{ + struct jcore_pit *pit = this_cpu_ptr(dev_id); + + if (clockevent_state_oneshot(&pit->ced)) + jcore_pit_disable(pit); + + pit->ced.event_handler(&pit->ced); + + return IRQ_HANDLED; +} + +static int __init jcore_pit_init(struct device_node *node) +{ + int err; + unsigned pit_irq, cpu; + unsigned long hwirq; + u32 irqprio, enable_val; + + jcore_pit_base = of_iomap(node, 0); + if (!jcore_pit_base) { + pr_err("Error: Cannot map base address for J-Core PIT\n"); + return -ENXIO; + } + + pit_irq = irq_of_parse_and_map(node, 0); + if (!pit_irq) { + pr_err("Error: J-Core PIT has no IRQ\n"); + return -ENXIO; + } + + pr_info("Initializing J-Core PIT at %p IRQ %d\n", + jcore_pit_base, pit_irq); + + err = clocksource_mmio_init(jcore_pit_base, "jcore_pit_cs", + NSEC_PER_SEC, 400, 32, + jcore_clocksource_read); + if (err) { + pr_err("Error registering clocksource device: %d\n", err); + return err; + } + + 
sched_clock_register(jcore_sched_clock_read, 32, NSEC_PER_SEC); + + jcore_pit_percpu = alloc_percpu(struct jcore_pit); + if (!jcore_pit_percpu) { + pr_err("Failed to allocate memory for clock event device\n"); + return -ENOMEM; + } + + err = request_irq(pit_irq, jcore_timer_interrupt, + IRQF_TIMER | IRQF_PERCPU, + "jcore_pit", jcore_pit_percpu); + if (err) { + pr_err("pit irq request failed: %d\n", err); + free_percpu(jcore_pit_percpu); + return err; + } + + /* + * The J-Core PIT is not hard-wired to a particular IRQ, but + * integrated with the interrupt controller such that the IRQ it + * generates is programmable, as follows: + * + * The bit layout of the PIT enable register is: + * + * .....e..ppppiiiiiiii............ + * + * where the .'s indicate unrelated/unused bits, e is enable, + * p is priority, and i is hard irq number. + * + * For the PIT included in AIC1 (obsolete but still in use), + * any hard irq (trap number) can be programmed via the 8 + * iiiiiiii bits, and a priority (0-15) is programmable + * separately in the pppp bits. + * + * For the PIT included in AIC2 (current), the programming + * interface is equivalent modulo interrupt mapping. This is + * why a different compatible tag was not used. However only + * traps 64-127 (the ones actually intended to be used for + * interrupts, rather than syscalls/exceptions/etc.) can be + * programmed (the high 2 bits of i are ignored) and the + * priority pppp is <<2'd and or'd onto the irq number. This + * choice seems to have been made on the hardware engineering + * side under an assumption that preserving old AIC1 priority + * mappings was important. Future models will likely ignore + * the pppp field. + */ + hwirq = irq_get_irq_data(pit_irq)->hwirq; + irqprio = (hwirq >> 2) & PIT_PRIO_MASK; + enable_val = (1U << PIT_ENABLE_SHIFT) + | (hwirq << PIT_IRQ_SHIFT) + | (irqprio << PIT_PRIO_SHIFT); + + for_each_present_cpu(cpu) { + struct jcore_pit *pit = per_cpu_ptr(jcore_pit_percpu, cpu); + + pit->base = of_iomap(node, cpu); + if (!pit->base) { + pr_err("Unable to map PIT for cpu %u\n", cpu); + continue; + } + + pit->ced.name = "jcore_pit"; + pit->ced.features = CLOCK_EVT_FEAT_PERIODIC + | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_PERCPU; + pit->ced.cpumask = cpumask_of(cpu); + pit->ced.rating = 400; + pit->ced.irq = pit_irq; + pit->ced.set_state_shutdown = jcore_pit_set_state_shutdown; + pit->ced.set_state_periodic = jcore_pit_set_state_periodic; + pit->ced.set_state_oneshot = jcore_pit_set_state_oneshot; + pit->ced.set_next_event = jcore_pit_set_next_event; + + pit->enable_val = enable_val; + } + + cpuhp_setup_state(CPUHP_AP_JCORE_TIMER_STARTING, + "AP_JCORE_TIMER_STARTING", + jcore_pit_local_init, NULL); + + return 0; +} + +CLOCKSOURCE_OF_DECLARE(jcore_pit, "jcore,pit", jcore_pit_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9b207a8c5af3..afe641c02dca 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -81,6 +81,7 @@ enum cpuhp_state { CPUHP_AP_ARM_ARCH_TIMER_STARTING, CPUHP_AP_ARM_GLOBAL_TIMER_STARTING, CPUHP_AP_DUMMY_TIMER_STARTING, + CPUHP_AP_JCORE_TIMER_STARTING, CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, CPUHP_AP_ARM_TWD_STARTING, CPUHP_AP_METAG_TIMER_STARTING, -- cgit v1.2.3 From fcd91dd449867c6bfe56a81cabba76b829fd05cd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 20 Oct 2016 15:58:02 +0200 Subject: net: add recursion limit to GRO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, GRO can do unlimited 
recursion through the gro_receive handlers. This was fixed for tunneling protocols by limiting tunnel GRO to one level with encap_mark, but both VLAN and TEB still have this problem. Thus, the kernel is vulnerable to a stack overflow, if we receive a packet composed entirely of VLAN headers. This patch adds a recursion counter to the GRO layer to prevent stack overflow. When a gro_receive function hits the recursion limit, GRO is aborted for this skb and it is processed normally. This recursion counter is put in the GRO CB, but could be turned into a percpu counter if we run out of space in the CB. Thanks to Vladimír BeneÅ¡ for the initial bug report. Fixes: CVE-2016-7039 Fixes: 9b174d88c257 ("net: Add Transparent Ethernet Bridging GRO support.") Fixes: 66e5133f19e9 ("vlan: Add GRO support for non hardware accelerated vlan") Signed-off-by: Sabrina Dubroca Reviewed-by: Jiri Benc Acked-by: Hannes Frederic Sowa Acked-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 39 ++++++++++++++++++++++++++++++++++++++- net/8021q/vlan.c | 2 +- net/core/dev.c | 1 + net/ethernet/eth.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/fou.c | 4 ++-- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 11 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3c20e87bb761..16af1ce99233 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -453,7 +453,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e7d16687538b..c1639a3e95a4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -583,7 +583,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, } } - pp = eth_gro_receive(head, skb); + pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; out: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 465e128699a1..91ee3643ccc8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2169,7 +2169,10 @@ struct napi_gro_cb { /* Used to determine if flush_id can be ignored */ u8 is_atomic:1; - /* 5 bit hole */ + /* Number of gro_receive callbacks this packet already went through */ + u8 recursion_counter:4; + + /* 1 bit hole */ /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2180,6 +2183,40 @@ struct napi_gro_cb { #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) +#define GRO_RECURSION_LIMIT 15 +static inline int gro_recursion_inc_test(struct sk_buff *skb) +{ + return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; +} + +typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); +static inline struct sk_buff **call_gro_receive(gro_receive_t cb, + struct sk_buff **head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(head, skb); +} + +typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, + struct sk_buff *); +static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { 
+ NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(sk, head, skb); +} + struct packet_type { __be16 type; /* This is really htons(ether_type). */ struct net_device *dev; /* NULL is wildcarded here */ diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8de138d3306b..f2531ad66b68 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -664,7 +664,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(*vhdr)); skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index b09ac57f4348..dbc871306910 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4511,6 +4511,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 66dff5e3d772..02acfff36028 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -439,7 +439,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1effc986739e..9648c97e541f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1391,7 +1391,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..030d1531e897 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive(struct sock *sk, if (!ops || !ops->callbacks.gro_receive) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -441,7 +441,7 @@ next_proto: if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 96e0efecefa6..d5cac99170b1 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -229,7 +229,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9333c963607..b2be1d9757ef 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -295,7 +295,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = udp_sk(sk)->gro_receive(sk, head, skb); + pp = 
call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index e7bfd55899a3..1fcf61f1cbc3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -246,7 +246,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, iph, nlen); - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); -- cgit v1.2.3 From 286c72deabaa240b7eebbd99496ed3324d69f3c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Oct 2016 09:39:40 -0700 Subject: udp: must lock the socket in udp_disconnect() Baozeng Ding reported KASAN traces showing uses after free in udp_lib_get_port() and other related UDP functions. A CONFIG_DEBUG_PAGEALLOC=y kernel would eventually crash. I could write a reproducer with two threads doing : static int sock_fd; static void *thr1(void *arg) { for (;;) { connect(sock_fd, (const struct sockaddr *)arg, sizeof(struct sockaddr_in)); } } static void *thr2(void *arg) { struct sockaddr_in unspec; for (;;) { memset(&unspec, 0, sizeof(unspec)); connect(sock_fd, (const struct sockaddr *)&unspec, sizeof(unspec)); } } Problem is that udp_disconnect() could run without holding socket lock, and this was causing list corruptions. Signed-off-by: Eric Dumazet Reported-by: Baozeng Ding Signed-off-by: David S. Miller --- include/net/udp.h | 1 + net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 13 +++++++++++-- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- 8 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index ea53a87d880f..4948790d393d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -258,6 +258,7 @@ void udp_flush_pending_frames(struct sock *sk); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7cf7d6e380c2..205e2000d395 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -994,7 +994,7 @@ struct proto ping_prot = { .init = ping_init_sock, .close = ping_close, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 90a85c955872..ecbe5a7c2d6d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -918,7 +918,7 @@ struct proto raw_prot = { .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_init, .setsockopt = raw_setsockopt, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d96dc2d3d08..311613e413cb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1345,7 +1345,7 @@ csum_copy_err: goto try_again; } -int udp_disconnect(struct sock *sk, int flags) +int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* @@ -1367,6 +1367,15 @@ int udp_disconnect(struct sock *sk, int flags) sk_dst_reset(sk); return 0; } 
+EXPORT_SYMBOL(__udp_disconnect); + +int udp_disconnect(struct sock *sk, int flags) +{ + lock_sock(sk); + __udp_disconnect(sk, flags); + release_sock(sk); + return 0; +} EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) @@ -2193,7 +2202,7 @@ int udp_abort(struct sock *sk, int err) sk->sk_err = err; sk->sk_error_report(sk); - udp_disconnect(sk, 0); + __udp_disconnect(sk, 0); release_sock(sk); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 0e983b694ee8..66e2d9dfc43a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -180,7 +180,7 @@ struct proto pingv6_prot = { .init = ping_init_sock, .close = ping_close, .connect = ip6_datagram_connect_v6_only, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = ping_v6_sendmsg, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 54404f08efcc..054a1d84fc5e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1241,7 +1241,7 @@ struct proto rawv6_prot = { .close = rawv6_close, .destroy = raw6_destroy, .connect = ip6_datagram_connect_v6_only, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, .setsockopt = rawv6_setsockopt, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 42de4ccd159f..fce25afb652a 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -338,7 +338,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags) if (sock_flag(sk, SOCK_ZAPPED)) return 0; - return udp_disconnect(sk, flags); + return __udp_disconnect(sk, flags); } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ea2ae6664cc8..ad3468c32b53 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -410,7 +410,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags) if (sock_flag(sk, SOCK_ZAPPED)) return 0; - return udp_disconnect(sk, flags); + return __udp_disconnect(sk, flags); } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, -- cgit v1.2.3 From 8651be8f14a12d24f203f283601d9b0418c389ff Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 19 Oct 2016 23:35:12 -0700 Subject: ipv6: fix a potential deadlock in do_ipv6_setsockopt() Baozeng reported this deadlock case: CPU0 CPU1 ---- ---- lock([ 165.136033] sk_lock-AF_INET6); lock([ 165.136033] rtnl_mutex); lock([ 165.136033] sk_lock-AF_INET6); lock([ 165.136033] rtnl_mutex); Similar to commit 87e9f0315952 ("ipv4: fix a potential deadlock in mcast getsockopt() path") this is due to we still have a case, ipv6_sock_mc_close(), where we acquire sk_lock before rtnl_lock. Close this deadlock with the similar solution, that is always acquire rtnl lock first. Fixes: baf606d9c9b1 ("ipv4,ipv6: grab rtnl before locking the socket") Reported-by: Baozeng Ding Tested-by: Baozeng Ding Cc: Marcelo Ricardo Leitner Signed-off-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 1 + net/ipv6/ipv6_sockglue.c | 3 ++- net/ipv6/mcast.c | 17 ++++++++++++----- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f2d072787947..8f998afc1384 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -174,6 +174,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); +void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 5330262ab673..636ec56f5f50 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -120,6 +120,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, static bool setsockopt_needs_rtnl(int optname) { switch (optname) { + case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: @@ -198,7 +199,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, } fl6_free_socklist(sk); - ipv6_sock_mc_close(sk); + __ipv6_sock_mc_close(sk); /* * Sock is moving from IPv6 to IPv4 (sk_prot), so diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 75c1fc54f188..14a3903f1c82 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -276,16 +276,14 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, return idev; } -void ipv6_sock_mc_close(struct sock *sk) +void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); - if (!rcu_access_pointer(np->ipv6_mc_list)) - return; + ASSERT_RTNL(); - rtnl_lock(); while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) { struct net_device *dev; @@ -303,8 +301,17 @@ void ipv6_sock_mc_close(struct sock *sk) atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); - } +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + if (!rcu_access_pointer(np->ipv6_mc_list)) + return; + rtnl_lock(); + __ipv6_sock_mc_close(sk); rtnl_unlock(); } -- cgit v1.2.3 From 0e191827383a6503a3bc547e63c74ff093f450f5 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Fri, 21 Oct 2016 04:43:42 -0400 Subject: qed*: Reduce the memory footprint for Rx path With the current default values for Rx path i.e., 8 queues of 8Kb entries each with 4Kb size, interface will consume 256Mb for Rx. The default values causing the driver probe to fail when the system memory is low. Based on the perforamnce results, rx-ring count value of 1Kb gives the comparable performance with Rx coalesce timeout of 12 seconds. Updating the default values. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 1 + drivers/net/ethernet/qlogic/qede/qede.h | 2 +- include/linux/qed/qed_if.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 8dc3f4670f64..c418360ba02a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -878,6 +878,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, } } + cdev->rx_coalesce_usecs = QED_DEFAULT_RX_USECS; rc = qed_nic_setup(cdev); if (rc) goto err; diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index d6a4a9a5eb4d..974689a13337 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -354,7 +354,7 @@ void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq); #define RX_RING_SIZE ((u16)BIT(RX_RING_SIZE_POW)) #define NUM_RX_BDS_MAX (RX_RING_SIZE - 1) #define NUM_RX_BDS_MIN 128 -#define NUM_RX_BDS_DEF NUM_RX_BDS_MAX +#define NUM_RX_BDS_DEF ((u16)BIT(10) - 1) #define TX_RING_SIZE_POW 13 #define TX_RING_SIZE ((u16)BIT(TX_RING_SIZE_POW)) diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index f9ae903bbb84..8978a60371f4 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -146,6 +146,7 @@ enum qed_led_mode { #define DIRECT_REG_RD(reg_addr) readl((void __iomem *)(reg_addr)) #define QED_COALESCE_MAX 0xFF +#define QED_DEFAULT_RX_USECS 12 /* forward */ struct qed_dev; -- cgit v1.2.3 From f1caa61df2a3dc4c58316295c5dc5edba4c68d85 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Mon, 24 Oct 2016 00:31:31 -0400 Subject: ACPI/PCI: pci_link: penalize SCI correctly Ondrej reported that IRQs stopped working in v4.7 on several platforms. A typical scenario, from Ondrej's VT82C694X/694X, is: ACPI: Using PIC for interrupt routing ACPI: PCI Interrupt Link [LNKA] (IRQs 1 3 4 5 6 7 10 *11 12 14 15) ACPI: No IRQ available for PCI Interrupt Link [LNKA] 8139too 0000:00:0f.0: PCI INT A: no GSI We're using PIC routing, so acpi_irq_balance == 0, and LNKA is already active at IRQ 11. In that case, acpi_pci_link_allocate() only tries to use the active IRQ (IRQ 11) which also happens to be the SCI. We should penalize the SCI by PIRQ_PENALTY_PCI_USING, but irq_get_trigger_type(11) returns something other than IRQ_TYPE_LEVEL_LOW, so we penalize it by PIRQ_PENALTY_ISA_ALWAYS instead, which makes acpi_pci_link_allocate() assume the IRQ isn't available and give up. Add acpi_penalize_sci_irq() so platforms can tell us the SCI IRQ, trigger, and polarity directly and we don't have to depend on irq_get_trigger_type(). Fixes: 103544d86976 (ACPI,PCI,IRQ: reduce resource requirements) Link: http://lkml.kernel.org/r/201609251512.05657.linux@rainbow-software.org Reported-by: Ondrej Zary Acked-by: Bjorn Helgaas Signed-off-by: Sinan Kaya Tested-by: Jonathan Liu Signed-off-by: Rafael J. 
Wysocki --- arch/x86/kernel/acpi/boot.c | 1 + drivers/acpi/pci_link.c | 30 +++++++++++++++--------------- include/linux/acpi.h | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8a5abaa7d453..931ced8ca345 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -454,6 +454,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 6229b022a5d4..74bf96efae95 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -87,6 +87,7 @@ struct acpi_pci_link { static LIST_HEAD(acpi_link_list); static DEFINE_MUTEX(acpi_link_lock); +static int sci_irq = -1, sci_penalty; /* -------------------------------------------------------------------------- PCI Link Device Management @@ -496,25 +497,13 @@ static int acpi_irq_get_penalty(int irq) { int penalty = 0; - /* - * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict - * with PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be - * use for PCI IRQs. - */ - if (irq == acpi_gbl_FADT.sci_interrupt) { - u32 type = irq_get_trigger_type(irq) & IRQ_TYPE_SENSE_MASK; - - if (type != IRQ_TYPE_LEVEL_LOW) - penalty += PIRQ_PENALTY_ISA_ALWAYS; - else - penalty += PIRQ_PENALTY_PCI_USING; - } + if (irq == sci_irq) + penalty += sci_penalty; if (irq < ACPI_MAX_ISA_IRQS) return penalty + acpi_isa_irq_penalty[irq]; - penalty += acpi_irq_pci_sharing_penalty(irq); - return penalty; + return penalty + acpi_irq_pci_sharing_penalty(irq); } int __init acpi_irq_penalty_init(void) @@ -881,6 +870,17 @@ bool acpi_isa_irq_available(int irq) acpi_irq_get_penalty(irq) < PIRQ_PENALTY_ISA_ALWAYS); } +void acpi_penalize_sci_irq(int irq, int trigger, int polarity) +{ + sci_irq = irq; + + if (trigger == ACPI_MADT_TRIGGER_LEVEL && + polarity == ACPI_MADT_POLARITY_ACTIVE_LOW) + sci_penalty = PIRQ_PENALTY_PCI_USING; + else + sci_penalty = PIRQ_PENALTY_ISA_ALWAYS; +} + /* * Over-ride default table to reserve additional IRQs for use by ISA * e.g. acpi_irq_isa=5 diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ddbeda6dbdc8..689a8b9b9c8f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -326,6 +326,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); +void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); -- cgit v1.2.3 From 0d7317598214134d73da59990b846481a9527a00 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 24 Oct 2016 10:57:25 +0100 Subject: mm: unexport __get_user_pages() This patch unexports the low-level __get_user_pages() function. Recent refactoring of the get_user_pages* functions allow flags to be passed through get_user_pages() which eliminates the need for access to this function from its one user, kvm. 
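As a rough sketch of what a caller looks like after this change (the wrapper name below is hypothetical; only the get_user_pages() call itself mirrors the kvm_main.c conversion in this patch, and FOLL_TOUCH/FOLL_GET are supplied internally by the gup machinery):

	#include <linux/mm.h>

	/* Pin one user page for writing; returns the number of pages pinned (1) or a negative errno. */
	static long pin_one_user_page(unsigned long start, struct page **page)
	{
		return get_user_pages(start, 1, FOLL_WRITE | FOLL_HWPOISON, page, NULL);
	}

The caller remains responsible for dropping the reference with put_page() once it is done with the page.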
We can see that the two calls to get_user_pages() which replace __get_user_pages() in kvm_main.c are equivalent by examining their call stacks: get_user_page_nowait(): get_user_pages(start, 1, flags, page, NULL) __get_user_pages_locked(current, current->mm, start, 1, page, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, start, 1, flags | FOLL_TOUCH | FOLL_GET, page, NULL, NULL) check_user_page_hwpoison(): get_user_pages(addr, 1, flags, NULL, NULL) __get_user_pages_locked(current, current->mm, addr, 1, NULL, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, addr, 1, flags | FOLL_TOUCH, NULL, NULL, NULL) Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- mm/gup.c | 3 +-- mm/nommu.c | 2 +- virt/kvm/kvm_main.c | 10 ++++------ 4 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a191853faaa..a92c8d73aeaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/gup.c b/mm/gup.c index 7aa113c2d373..ec4f82704b6f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -526,7 +526,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. 
*/ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -631,7 +631,6 @@ next_page: } while (nr_pages); return i; } -EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { diff --git a/mm/nommu.c b/mm/nommu.c index db5fd1795298..8b8faaf2a9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,7 +109,7 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28510e72618a..2907b7b78654 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1346,21 +1346,19 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w static int get_user_page_nowait(unsigned long start, int write, struct page **page) { - int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + int flags = FOLL_NOWAIT | FOLL_HWPOISON; if (write) flags |= FOLL_WRITE; - return __get_user_pages(current, current->mm, start, 1, flags, page, - NULL, NULL); + return get_user_pages(start, 1, flags, page, NULL); } static inline int check_user_page_hwpoison(unsigned long addr) { - int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = __get_user_pages(current, current->mm, addr, 1, - flags, NULL, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL, NULL); return rc == -EHWPOISON; } -- cgit v1.2.3 From 8ef4227615e158faa4ee85a1d6466782f7e22f2f Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:27:59 +1000 Subject: x86/io: add interface to reserve io memtype for a resource range. (v1.1) A recent change to the mm code in: 87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() started enforcing checking the memory type against the registered list for amixed pfn insertion mappings. It happens that the drm drivers for a number of gpus relied on this being broken. Currently the driver only inserted VRAM mappings into the tracking table when they came from the kernel, and userspace mappings never landed in the table. This led to a regression where all the mapping end up as UC instead of WC now. I've considered a number of solutions but since this needs to be fixed in fixes and not next, and some of the solutions were going to introduce overhead that hadn't been there before I didn't consider them viable at this stage. These mainly concerned hooking into the TTM io reserve APIs, but these API have a bunch of fast paths I didn't want to unwind to add this to. The solution I've decided on is to add a new API like the arch_phys_wc APIs (these would have worked but wc_del didn't take a range), and use them from the drivers to add a WC compatible mapping to the table for all VRAM on those GPUs. This means we can then create userspace mapping that won't get degraded to UC. v1.1: use CONFIG_X86_PAT + add some comments in io.h Cc: Toshi Kani Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: x86@kernel.org Cc: mcgrof@suse.com Cc: Dan Williams Acked-by: Ingo Molnar Reviewed-by: Thomas Gleixner Signed-off-by: Dave Airlie --- arch/x86/include/asm/io.h | 6 ++++++ arch/x86/mm/pat.c | 14 ++++++++++++++ include/linux/io.h | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index de25aad07853..d34bd370074b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -351,4 +351,10 @@ extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif +#ifdef CONFIG_X86_PAT +extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); +extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); +#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 170cc4ff057b..83e701f160a9 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -730,6 +730,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end) free_memtype(start, end); } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/include/linux/io.h b/include/linux/io.h index e2c8419278c1..82ef36eac8a1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -141,4 +141,26 @@ enum { void *memremap(resource_size_t offset, size_t size, unsigned long flags); void memunmap(void *addr); +/* + * On x86 PAT systems we have memory tracking that keeps track of + * the allowed mappings on memory ranges. This tracking works for + * all the in-kernel mapping APIs (ioremap*), but where the user + * wishes to map a range from a physical device into user memory + * the tracking won't be updated. This API is to be used by + * drivers which remap physical device pages into userspace, + * and wants to make sure they are mapped WC and not UC. + */ +#ifndef arch_io_reserve_memtype_wc +static inline int arch_io_reserve_memtype_wc(resource_size_t base, + resource_size_t size) +{ + return 0; +} + +static inline void arch_io_free_memtype_wc(resource_size_t base, + resource_size_t size) +{ +} +#endif + #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From b4f7f4ad425a84fd5d40da21aff8681997b25ff9 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 21 Oct 2016 15:57:23 +0300 Subject: mac80211: fix some sphinx warnings Signed-off-by: Jani Nikula Signed-off-by: Johannes Berg --- include/net/mac80211.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a810dfcb83c2..e2dba93e374f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -811,14 +811,18 @@ enum mac80211_rate_control_flags { * in the control information, and it will be filled by the rate * control algorithm according to what should be sent. 
For example, * if this array contains, in the format { , } the - * information + * information:: + * * { 3, 2 }, { 2, 2 }, { 1, 4 }, { -1, 0 }, { -1, 0 } + * * then this means that the frame should be transmitted * up to twice at rate 3, up to twice at rate 2, and up to four * times at rate 1 if it doesn't get acknowledged. Say it gets * acknowledged by the peer after the fifth attempt, the status - * information should then contain + * information should then contain:: + * * { 3, 2 }, { 2, 2 }, { 1, 1 }, { -1, 0 } ... + * * since it was transmitted twice at rate 3, twice at rate 2 * and once at rate 1 after which we received an acknowledgement. */ @@ -1168,8 +1172,8 @@ enum mac80211_rx_vht_flags { * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) * @vht_nss: number of streams (VHT only) - * @flag: %RX_FLAG_* - * @vht_flag: %RX_VHT_FLAG_* + * @flag: %RX_FLAG_\* + * @vht_flag: %RX_VHT_FLAG_\* * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU @@ -1432,7 +1436,7 @@ enum ieee80211_vif_flags { * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. * @drv_priv: data area for driver use, will always be aligned to - * sizeof(void *). + * sizeof(void \*). * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) */ struct ieee80211_vif { @@ -1743,7 +1747,7 @@ struct ieee80211_sta_rates { * @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to - * sizeof(void *), size is determined in hw information. + * sizeof(void \*), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid * if wme is supported. * @max_sp: max Service Period. Only valid if wme is supported. @@ -2146,12 +2150,12 @@ enum ieee80211_hw_flags { * * @radiotap_mcs_details: lists which MCS information can the HW * reports, by default it is set to _MCS, _GI and _BW but doesn't - * include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_* values, only + * include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_\* values, only * adding _BW is supported today. * * @radiotap_vht_details: lists which VHT MCS information the HW reports, * the default is _GI | _BANDWIDTH. - * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_* values. + * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * 'units_pos' member is set to a non-negative value it must be set to @@ -2486,6 +2490,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * in the software stack cares about, we will, in the future, have mac80211 * tell the driver which information elements are interesting in the sense * that we want to see changes in them. This will include + * * - a list of information element IDs * - a list of OUIs for the vendor information element * -- cgit v1.2.3 From 293de7dee4f9602676846dbeb84b1580123306b4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 23 Oct 2016 09:28:29 -0700 Subject: doc: update docbook annotations for socket and skb The skbuff and sock structure both had missing parameter annotation values. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + include/net/sock.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 601258f6e621..32810f279f8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -936,6 +936,7 @@ struct sk_buff_fclones { /** * skb_fclone_busy - check if fclone is busy + * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. diff --git a/include/net/sock.h b/include/net/sock.h index ebf75db08e06..73c6b008f1b7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -252,6 +252,7 @@ struct sock_common { * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes + * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) @@ -302,7 +303,8 @@ struct sock_common { * @sk_backlog_rcv: callback to process the backlog * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container - */ + * @sk_rcu: used during RCU grace period + */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just -- cgit v1.2.3 From 10df8e6152c6c400a563a673e9956320bfce1871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Oct 2016 18:03:06 -0700 Subject: udp: fix IP_CHECKSUM handling First bug was added in commit ad6f939ab193 ("ip: Add offset parameter to ip_cmsg_recv") : Tom missed that ipv4 udp messages could be received on AF_INET6 socket. ip_cmsg_recv(msg, skb) should have been replaced by ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr)); Then commit e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") forgot to adjust the offsets now UDP headers are pulled before skb are put in receive queue. Fixes: ad6f939ab193 ("ip: Add offset parameter to ip_cmsg_recv") Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") Signed-off-by: Eric Dumazet Cc: Sam Kumar Cc: Willem de Bruijn Tested-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ip.h | 4 ++-- net/ipv4/ip_sockglue.c | 11 ++++++----- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index c9d07988911e..5413883ac47f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -578,7 +578,7 @@ int ip_options_rcv_srr(struct sk_buff *skb); */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); -void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int offset); +void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -600,7 +600,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { - ip_cmsg_recv_offset(msg, skb, 0); + ip_cmsg_recv_offset(msg, skb, 0, 0); } bool icmp_global_allow(void); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index af4919792b6a..b8a2d63d1fb8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -98,7 +98,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) } static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, - int offset) + int tlen, int offset) { __wsum csum = skb->csum; @@ -106,8 +106,9 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, return; if (offset != 0) - csum = csum_sub(csum, csum_partial(skb_transport_header(skb), - offset, 0)); + csum = csum_sub(csum, + csum_partial(skb_transport_header(skb) + tlen, + offset, 0)); put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } @@ -153,7 +154,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) } void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, - int offset) + int tlen, int offset) { struct inet_sock *inet = inet_sk(skb->sk); unsigned int flags = inet->cmsg_flags; @@ -216,7 +217,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, } if (flags & IP_CMSG_CHECKSUM) - ip_cmsg_recv_checksum(msg, skb, offset); + ip_cmsg_recv_checksum(msg, skb, tlen, offset); } EXPORT_SYMBOL(ip_cmsg_recv_offset); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 311613e413cb..d123d68f4d1d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1322,7 +1322,7 @@ try_again: *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off); + ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9aa7c1c7a9ce..b2ef061e6836 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -427,7 +427,8 @@ try_again: if (is_udp4) { if (inet->cmsg_flags) - ip_cmsg_recv(msg, skb); + ip_cmsg_recv_offset(msg, skb, + sizeof(struct udphdr), off); } else { if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack 
object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 +------------ kernel/sched/core.c | 16 +++++++ kernel/sched/wait.c | 10 ----- mm/filemap.c | 4 +- mm/memory_hotplug.c | 28 ------------ mm/page_alloc.c | 115 +------------------------------------------------ 6 files changed, 21 insertions(+), 182 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. 
- */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 
5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..c7fe2f16503f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..b18dab401be6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2158,20 +2144,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..de7c6e43b1c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif } -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. 
- */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. 
- */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } -- cgit v1.2.3 From 830218c1add1da16519b71909e5cf21522b7d062 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 24 Oct 2016 10:52:35 -0700 Subject: net: ipv6: Fix processing of RAs in presence of VRF rt6_add_route_info and rt6_add_dflt_router were updated to pull the FIB table from the device index, but the corresponding rt6_get_route_info and rt6_get_dflt_router functions were not leading to the failure to process RA's: ICMPv6: RA: ndisc_router_discovery failed to add default route Fix the 'get' functions by using the table id associated with the device when applicable. Also, now that default routes can be added to tables other than the default table, rt6_purge_dflt_routers needs to be updated as well to look at all tables. To handle that efficiently, add a flag to the table denoting if it is has a default route via RA. Fixes: ca254490c8dfd ("net: Add VRF support to IPv6 stack") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/route.c | 68 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fb961a576abe..a74e2aa40ef4 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -230,6 +230,8 @@ struct fib6_table { rwlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; + unsigned int flags; +#define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bdbc38e8bf29..3ac19eb81a86 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -102,11 +102,13 @@ static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, - const struct in6_addr *gwaddr, int ifindex, + const struct in6_addr *gwaddr, + struct net_device *dev, unsigned int pref); static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, - const struct in6_addr *gwaddr, int ifindex); + const struct in6_addr *gwaddr, + struct net_device *dev); #endif struct uncached_list { @@ -803,7 +805,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, rt = rt6_get_dflt_router(gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, - gwaddr, dev->ifindex); + gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(rt); @@ -811,8 +813,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (!rt && lifetime) - rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex, - pref); + rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, + dev, pref); 
else if (rt) rt->rt6i_flags = RTF_ROUTEINFO | (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); @@ -2325,13 +2327,16 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, - const struct in6_addr *gwaddr, int ifindex) + const struct in6_addr *gwaddr, + struct net_device *dev) { + u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; + int ifindex = dev->ifindex; struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; - table = fib6_get_table(net, RT6_TABLE_INFO); + table = fib6_get_table(net, tb_id); if (!table) return NULL; @@ -2357,12 +2362,13 @@ out: static struct rt6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, - const struct in6_addr *gwaddr, int ifindex, + const struct in6_addr *gwaddr, + struct net_device *dev, unsigned int pref) { struct fib6_config cfg = { .fc_metric = IP6_RT_PRIO_USER, - .fc_ifindex = ifindex, + .fc_ifindex = dev->ifindex, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), @@ -2371,7 +2377,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_nlinfo.nl_net = net, }; - cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO; + cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -2381,16 +2387,17 @@ static struct rt6_info *rt6_add_route_info(struct net *net, ip6_route_add(&cfg); - return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex); + return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) { + u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); + table = fib6_get_table(dev_net(dev), tb_id); if (!table) return NULL; @@ -2424,20 +2431,20 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, cfg.fc_gateway = *gwaddr; - ip6_route_add(&cfg); + if (!ip6_route_add(&cfg)) { + struct fib6_table *table; + + table = fib6_get_table(dev_net(dev), cfg.fc_table); + if (table) + table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; + } return rt6_get_dflt_router(gwaddr, dev); } -void rt6_purge_dflt_routers(struct net *net) +static void __rt6_purge_dflt_routers(struct fib6_table *table) { struct rt6_info *rt; - struct fib6_table *table; - - /* NOTE: Keep consistent with rt6_get_dflt_router */ - table = fib6_get_table(net, RT6_TABLE_DFLT); - if (!table) - return; restart: read_lock_bh(&table->tb6_lock); @@ -2451,6 +2458,27 @@ restart: } } read_unlock_bh(&table->tb6_lock); + + table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; +} + +void rt6_purge_dflt_routers(struct net *net) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) + __rt6_purge_dflt_routers(table); + } + } + + rcu_read_unlock(); } static void rtmsg_to_fib6_config(struct net *net, -- cgit v1.2.3 From d5d32e4b76687f4df9ad3ba8d3702b7347f51fa6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 24 Oct 2016 12:27:23 -0700 Subject: net: ipv6: Do not consider link state for nexthop validation Similar to IPv4, do not consider link state when validating next hops. Currently, if the link is down default routes can fail to insert: $ ip -6 ro add vrf blue default via 2100:2::64 dev eth2 RTNETLINK answers: No route to host With this patch the command succeeds. Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + net/ipv6/route.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index e0cd318d5103..f83e78d071a3 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -32,6 +32,7 @@ struct route_info { #define RT6_LOOKUP_F_SRCPREF_TMP 0x00000008 #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 +#define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 /* We do not (yet ?) 
support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3ac19eb81a86..947ed1ded026 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -658,7 +658,8 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, struct net_device *dev = rt->dst.dev; if (dev && !netif_carrier_ok(dev) && - idev->cnf.ignore_routes_with_linkdown) + idev->cnf.ignore_routes_with_linkdown && + !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; if (rt6_check_expired(rt)) @@ -1052,6 +1053,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; + strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; @@ -1791,7 +1793,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, }; struct fib6_table *table; struct rt6_info *rt; - int flags = RT6_LOOKUP_F_IFACE; + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; table = fib6_get_table(net, cfg->fc_table); if (!table) -- cgit v1.2.3 From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 27 Oct 2016 17:46:38 -0700 Subject: kconfig.h: remove config_enabled() macro The use of config_enabled() is ambiguous. For config options, IS_ENABLED(), IS_REACHABLE(), etc. will make intention clearer. Sometimes config_enabled() has been used for non-config options because it is useful to check whether the given symbol is defined or not. I have been tackling on deprecating config_enabled(), and now is the time to finish this work. Some new users have appeared for v4.9-rc1, but it is trivial to replace them: - arch/x86/mm/kaslr.c replace config_enabled() with IS_ENABLED() because CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean. - include/asm-generic/export.h replace config_enabled() with __is_defined(). Then, config_enabled() can be removed now. Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config options, and __is_defined() for non-config symbols. Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Ingo Molnar Acked-by: Nicolas Pitre Cc: Peter Oberparleiter Cc: Arnd Bergmann Cc: Kees Cook Cc: Michal Marek Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thomas Garnier Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kaslr.c | 6 +++--- include/asm-generic/export.h | 2 +- include/linux/kconfig.h | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ddd2661c4502..887e57182716 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -104,10 +104,10 @@ void __init kernel_randomize_memory(void) * consistent with the vaddr_start/vaddr_end variables. 
*/ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && vaddr_end >= EFI_VA_START); - BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || - config_enabled(CONFIG_EFI)) && + BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || + IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 43199a049da5..63554e9f6e0c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -70,7 +70,7 @@ KSYM(__kcrctab_\name): #include #define __EXPORT_SYMBOL(sym, val, sec) \ - __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, val, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, val, sec, conf) \ ___cond_export_sym(sym, val, sec, conf) #define ___cond_export_sym(sym, val, sec, enabled) \ diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 15ec117ec537..8f2e059e4d45 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -31,7 +31,6 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) @@ -41,13 +40,13 @@ * otherwise. For boolean options, this is equivalent to * IS_ENABLED(CONFIG_FOO). */ -#define IS_BUILTIN(option) config_enabled(option) +#define IS_BUILTIN(option) __is_defined(option) /* * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 * otherwise. */ -#define IS_MODULE(option) config_enabled(option##_MODULE) +#define IS_MODULE(option) __is_defined(option##_MODULE) /* * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! 
other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 9781c69eae57..03d089b3ed72 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args) if (!stepped) { WARN(1, "Unable to handle hardware breakpoint. 
Breakpoint at " "0x%lx will be disabled.", info->address); - perf_event_disable(bp); + perf_event_disable_inatomic(bp); goto out; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit v1.2.3 From b47bd6ea40636362a8b6605de51207cc387ba0b8 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 25 Oct 2016 18:36:24 +0300 Subject: {net, ib}/mlx5: Make cache line size determination at runtime. ARM 64B cache line systems have L1_CACHE_BYTES set to 128. cache_line_size() will return the correct size. Fixes: cf50b5efa2fe('net/mlx5_core/ib: New device capabilities handling.') Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 31 +++++++++++++++++++++---- include/linux/mlx5/driver.h | 11 --------- 4 files changed, 27 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 22174774dbb8..63036c731626 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1019,7 +1019,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); - resp.cache_line_size = L1_CACHE_BYTES; + resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 41f4c2afbcdd..7ce97daf26c6 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -52,7 +52,6 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, - MLX5_IB_CACHE_LINE_SIZE = 64, }; static const u32 mlx5_ib_opcode[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 6cb38304669f..2c6e3c7b7417 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -41,6 +41,13 @@ #include "mlx5_core.h" +struct mlx5_db_pgdir { + struct list_head list; + unsigned long *bitmap; + __be32 *db_page; + dma_addr_t db_dma; +}; + /* Handling for queue buffers -- we allocate a bunch of memory and * register it in a memory region at HCA virtual address 0. 
*/ @@ -102,17 +109,28 @@ EXPORT_SYMBOL_GPL(mlx5_buf_free); static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, int node) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); struct mlx5_db_pgdir *pgdir; pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); if (!pgdir) return NULL; - bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE); + pgdir->bitmap = kcalloc(BITS_TO_LONGS(db_per_page), + sizeof(unsigned long), + GFP_KERNEL); + + if (!pgdir->bitmap) { + kfree(pgdir); + return NULL; + } + + bitmap_fill(pgdir->bitmap, db_per_page); pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE, &pgdir->db_dma, node); if (!pgdir->db_page) { + kfree(pgdir->bitmap); kfree(pgdir); return NULL; } @@ -123,18 +141,19 @@ static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, struct mlx5_db *db) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); int offset; int i; - i = find_first_bit(pgdir->bitmap, MLX5_DB_PER_PAGE); - if (i >= MLX5_DB_PER_PAGE) + i = find_first_bit(pgdir->bitmap, db_per_page); + if (i >= db_per_page) return -ENOMEM; __clear_bit(i, pgdir->bitmap); db->u.pgdir = pgdir; db->index = i; - offset = db->index * L1_CACHE_BYTES; + offset = db->index * cache_line_size(); db->db = pgdir->db_page + offset / sizeof(*pgdir->db_page); db->dma = pgdir->db_dma + offset; @@ -181,14 +200,16 @@ EXPORT_SYMBOL_GPL(mlx5_db_alloc); void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); mutex_lock(&dev->priv.pgdir_mutex); __set_bit(db->index, db->u.pgdir->bitmap); - if (bitmap_full(db->u.pgdir->bitmap, MLX5_DB_PER_PAGE)) { + if (bitmap_full(db->u.pgdir->bitmap, db_per_page)) { dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, db->u.pgdir->db_page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); + kfree(db->u.pgdir->bitmap); kfree(db->u.pgdir); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 85c4786427e4..5dbda60a09f4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -625,10 +625,6 @@ struct mlx5_db { int index; }; -enum { - MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES, -}; - enum { MLX5_COMP_EQ_SIZE = 1024, }; @@ -638,13 +634,6 @@ enum { MLX5_PTYS_EN = 1 << 2, }; -struct mlx5_db_pgdir { - struct list_head list; - DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE); - __be32 *db_page; - dma_addr_t db_dma; -}; - typedef void (*mlx5_cmd_cbk_t)(int status, void *context); struct mlx5_cmd_work_ent { -- cgit v1.2.3 From 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Tue, 25 Oct 2016 18:36:33 +0300 Subject: net/mlx5: Fix race between PCI error handlers and health work Currently there is a race between the health care work and the kernel pci error handlers because both of them detect the error, the first one to be called will do the error handling. There is a chance that health care will disable the pci after resuming pci slot. Also create a separate WQ because now we will have two types of health works, one for the error detection and one for the recovery. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 30 +++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 +++++-- include/linux/mlx5/driver.h | 4 ++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 2cb4094c9c49..2d00022c92d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -64,6 +64,10 @@ enum { MLX5_NIC_IFC_NO_DRAM_NIC = 2 }; +enum { + MLX5_DROP_NEW_HEALTH_WORK, +}; + static u8 get_nic_interface(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; @@ -272,7 +276,13 @@ static void poll_health(unsigned long data) if (in_fatal(dev) && !health->sick) { health->sick = true; print_health_info(dev); - schedule_work(&health->work); + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->work); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } } @@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) init_timer(&health->timer); health->sick = 0; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; @@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev) del_timer_sync(&health->timer); } +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + spin_lock(&health->wq_lock); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock(&health->wq_lock); + cancel_work_sync(&health->work); +} + void mlx5_health_cleanup(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; - flush_work(&health->work); + destroy_workqueue(health->wq); } int mlx5_health_init(struct mlx5_core_dev *dev) @@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev) strcpy(name, "mlx5_health"); strcat(name, dev_name(&dev->pdev->dev)); + health->wq = create_singlethread_workqueue(name); kfree(name); - + if (!health->wq) + return -ENOMEM; + spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8a63910bfccf..9f90226bc120 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, dev_info(&pdev->dev, "%s was called\n", __func__); mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - pci_save_state(pdev); - mlx5_pci_disable_device(dev); + /* In case of kernel call save the pci state and drain health wq */ + if (state) { + pci_save_state(pdev); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + } + return state == pci_channel_io_perm_failure ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5dbda60a09f4..7d9a5d08eb59 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -418,7 +418,10 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + /* wq spinlock to synchronize draining */ + spinlock_t wq_lock; struct workqueue_struct *wq; + unsigned long flags; struct work_struct work; }; @@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_drain_health_wq(struct mlx5_core_dev *dev); int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); -- cgit v1.2.3 From 04c0c1ab38e95105d950db5b84e727637e149ce7 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Tue, 25 Oct 2016 18:36:34 +0300 Subject: net/mlx5: PCI error recovery health care simulation In case that the kernel PCI error handlers are not called, we will trigger our own recovery flow. The health work will give priority to the kernel pci error handlers to recover the PCI by waiting for a small period, if the pci error handlers are not triggered the manual recovery flow will be executed. We don't save pci state in case of manual recovery because it will ruin the pci configuration space and we will lose dma sync. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 45 ++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 18 ++++++--- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + include/linux/mlx5/driver.h | 1 + 4 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 2d00022c92d8..5bcf93422ee0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -61,14 +61,15 @@ enum { enum { MLX5_NIC_IFC_FULL = 0, MLX5_NIC_IFC_DISABLED = 1, - MLX5_NIC_IFC_NO_DRAM_NIC = 2 + MLX5_NIC_IFC_NO_DRAM_NIC = 2, + MLX5_NIC_IFC_INVALID = 3 }; enum { MLX5_DROP_NEW_HEALTH_WORK, }; -static u8 get_nic_interface(struct mlx5_core_dev *dev) +static u8 get_nic_state(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; } @@ -101,7 +102,7 @@ static int in_fatal(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED) + if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) return 1; if (ioread32be(&h->fw_ver) == 0xffffffff) @@ -131,7 +132,7 @@ unlock: static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) { - u8 nic_interface = get_nic_interface(dev); + u8 nic_interface = get_nic_state(dev); switch (nic_interface) { case MLX5_NIC_IFC_FULL: @@ -153,8 +154,34 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) mlx5_disable_device(dev); } +static void health_recover(struct work_struct *work) +{ + struct mlx5_core_health *health; + struct delayed_work *dwork; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + u8 nic_state; + 
+ dwork = container_of(work, struct delayed_work, work); + health = container_of(dwork, struct mlx5_core_health, recover_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + nic_state = get_nic_state(dev); + if (nic_state == MLX5_NIC_IFC_INVALID) { + dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); + return; + } + + dev_err(&dev->pdev->dev, "starting health recovery flow\n"); + mlx5_recover_device(dev); +} + +/* How much time to wait until health resetting the driver (in msecs) */ +#define MLX5_RECOVERY_DELAY_MSECS 60000 static void health_care(struct work_struct *work) { + unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); struct mlx5_core_health *health; struct mlx5_core_dev *dev; struct mlx5_priv *priv; @@ -164,6 +191,14 @@ static void health_care(struct work_struct *work) dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); + + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + schedule_delayed_work(&health->recover_work, recover_delay); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } static const char *hsynd_str(u8 synd) @@ -316,6 +351,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) spin_lock(&health->wq_lock); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); spin_unlock(&health->wq_lock); + cancel_delayed_work_sync(&health->recover_work); cancel_work_sync(&health->work); } @@ -344,6 +380,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) return -ENOMEM; spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); + INIT_DELAYED_WORK(&health->recover_work, health_recover); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 9f90226bc120..d5433c49b2b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1313,6 +1313,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, struct mlx5_priv *priv = &dev->priv; dev_info(&pdev->dev, "%s was called\n", __func__); + mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); /* In case of kernel call save the pci state and drain health wq */ @@ -1378,11 +1379,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } -void mlx5_disable_device(struct mlx5_core_dev *dev) -{ - mlx5_pci_err_detected(dev->pdev, 0); -} - static void mlx5_pci_resume(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); @@ -1432,6 +1428,18 @@ static const struct pci_device_id mlx5_core_pci_table[] = { MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); +void mlx5_disable_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_err_detected(dev->pdev, 0); +} + +void mlx5_recover_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_disable_device(dev); + if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED) + mlx5_pci_resume(dev->pdev); +} + static struct pci_driver mlx5_core_driver = { .name = DRIVER_NAME, .id_table = mlx5_core_pci_table, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 3d0cfb9f18f9..187662c8ea96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -83,6 +83,7 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_enter_error_state(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); +void mlx5_recover_device(struct mlx5_core_dev *dev); int mlx5_sriov_init(struct mlx5_core_dev *dev); void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); int mlx5_sriov_attach(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7d9a5d08eb59..ecc451d89ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -423,6 +423,7 @@ struct mlx5_core_health { struct workqueue_struct *wq; unsigned long flags; struct work_struct work; + struct delayed_work recover_work; }; struct mlx5_cq_table { -- cgit v1.2.3 From e934f684856393a0b2aecc7fdd8357a48b79c535 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 26 Oct 2016 08:30:29 -0700 Subject: Revert "hv_netvsc: report vmbus name in ethtool" This reverts commit e3f74b841d48 ("hv_netvsc: report vmbus name in ethtool")' because of problem introduced by commit f9a56e5d6a0ba ("Drivers: hv: make VMBus bus ids persistent"). This changed the format of the vmbus name and this new format is too long to fit in the bus_info field of ethtool. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ---- include/linux/hyperv.h | 7 ------- 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c71d966d905f..f6382150b16a 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -699,12 +699,8 @@ int netvsc_recv_callback(struct hv_device *device_obj, static void netvsc_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { - struct net_device_context *net_device_ctx = netdev_priv(net); - struct hv_device *dev = net_device_ctx->device_ctx; - strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strlcpy(info->bus_info, vmbus_dev_name(dev), sizeof(info->bus_info)); } static void netvsc_get_channels(struct net_device *net, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6824556d37ed..cd184bdca58f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1169,13 +1169,6 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver, const char *mod_name); void vmbus_driver_unregister(struct hv_driver *hv_driver); -static inline const char *vmbus_dev_name(const struct hv_device *device_obj) -{ - const struct kobject *kobj = &device_obj->device.kobj; - - return kobj->name; -} - void vmbus_hvsock_device_unregister(struct vmbus_channel *channel); int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, -- cgit v1.2.3 From 6f2e0d2c3bf0f8d322ab7516c57340c7189cca02 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 27 Oct 2016 16:27:20 +0300 Subject: net/mlx4: Fix firmware command timeout during interrupt test Currently interrupt test that is part of ethtool selftest runs the check over all interrupt vectors of the device. In mlx4_en package part of interrupt vectors are uninitialized since mlx4_ib doesn't exist. This causes NOP FW command to time out. Change logic to test current port interrupt vectors only. Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_selftest.c | 26 +++++++++- drivers/net/ethernet/mellanox/mlx4/eq.c | 62 +++++++++++------------- include/linux/mlx4/device.h | 3 +- 3 files changed, 55 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c index b66e03d9711f..c06346a82496 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -118,6 +118,29 @@ mlx4_en_test_loopback_exit: return !loopback_ok; } +static int mlx4_en_test_interrupts(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int i = 0; + + err = mlx4_test_async(mdev->dev); + /* When not in MSI_X or slave, test only async */ + if (!(mdev->dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(mdev->dev)) + return err; + + /* A loop over all completion vectors of current port, + * for each vector check whether it works by mapping command + * completions to that vector and performing a NOP command + */ + for (i = 0; i < priv->rx_ring_num; i++) { + err = mlx4_test_interrupt(mdev->dev, priv->rx_cq[i]->vector); + if (err) + break; + } + + return err; +} static int mlx4_en_test_link(struct mlx4_en_priv *priv) { @@ -151,7 +174,6 @@ static int mlx4_en_test_speed(struct mlx4_en_priv *priv) void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) { struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; int i, carrier_ok; memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST); @@ -177,7 +199,7 @@ void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) netif_carrier_on(dev); } - buf[0] = mlx4_test_interrupts(mdev->dev); + buf[0] = mlx4_en_test_interrupts(priv); buf[1] = mlx4_en_test_link(priv); buf[2] = mlx4_en_test_speed(priv); diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index cf8f8a72a801..cd3638e6fe25 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -1361,53 +1361,49 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) kfree(priv->eq_table.uar_map); } -/* A test that verifies that we can accept interrupts on all - * the irq vectors of the device. +/* A test that verifies that we can accept interrupts + * on the vector allocated for asynchronous events + */ +int mlx4_test_async(struct mlx4_dev *dev) +{ + return mlx4_NOP(dev); +} +EXPORT_SYMBOL(mlx4_test_async); + +/* A test that verifies that we can accept interrupts + * on the given irq vector of the tested port. * Interrupts are checked using the NOP command. 
*/ -int mlx4_test_interrupts(struct mlx4_dev *dev) +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector) { struct mlx4_priv *priv = mlx4_priv(dev); - int i; int err; - err = mlx4_NOP(dev); - /* When not in MSI_X, there is only one irq to check */ - if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev)) - return err; - - /* A loop over all completion vectors, for each vector we will check - * whether it works by mapping command completions to that vector - * and performing a NOP command - */ - for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) { - /* Make sure request_irq was called */ - if (!priv->eq_table.eq[i].have_irq) - continue; - - /* Temporary use polling for command completions */ - mlx4_cmd_use_polling(dev); - - /* Map the new eq to handle all asynchronous events */ - err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, - priv->eq_table.eq[i].eqn); - if (err) { - mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); - mlx4_cmd_use_events(dev); - break; - } + /* Temporary use polling for command completions */ + mlx4_cmd_use_polling(dev); - /* Go back to using events */ - mlx4_cmd_use_events(dev); - err = mlx4_NOP(dev); + /* Map the new eq to handle all asynchronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn); + if (err) { + mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); + goto out; } + /* Go back to using events */ + mlx4_cmd_use_events(dev); + err = mlx4_NOP(dev); + /* Return to default */ + mlx4_cmd_use_polling(dev); +out: mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + mlx4_cmd_use_events(dev); + return err; } -EXPORT_SYMBOL(mlx4_test_interrupts); +EXPORT_SYMBOL(mlx4_test_interrupt); bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f6a164297358..3be7abd6e722 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1399,7 +1399,8 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey); int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); -int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); +int mlx4_test_async(struct mlx4_dev *dev); int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, const u32 offset[], u32 value[], size_t array_len, u8 port); -- cgit v1.2.3 From c6fcc4fc5f8b592600c7409e769ab68da0fb1eca Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Fri, 28 Oct 2016 09:59:15 -0700 Subject: vxlan: avoid using stale vxlan socket. When vxlan device is closed vxlan socket is freed. This operation can race with vxlan-xmit function which dereferences vxlan socket. Following patch uses RCU mechanism to avoid this situation. Signed-off-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 80 +++++++++++++++++++++++++++++++++-------------------- include/net/vxlan.h | 4 +-- 2 files changed, 52 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c1639a3e95a4..f3c2fa3ab0d5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -943,17 +943,20 @@ static bool vxlan_snoop(struct net_device *dev, static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) { struct vxlan_dev *vxlan; + struct vxlan_sock *sock4; + struct vxlan_sock *sock6 = NULL; unsigned short family = dev->default_dst.remote_ip.sa.sa_family; + sock4 = rtnl_dereference(dev->vn4_sock); + /* The vxlan_sock is only used by dev, leaving group has * no effect on other vxlan devices. */ - if (family == AF_INET && dev->vn4_sock && - atomic_read(&dev->vn4_sock->refcnt) == 1) + if (family == AF_INET && sock4 && atomic_read(&sock4->refcnt) == 1) return false; #if IS_ENABLED(CONFIG_IPV6) - if (family == AF_INET6 && dev->vn6_sock && - atomic_read(&dev->vn6_sock->refcnt) == 1) + sock6 = rtnl_dereference(dev->vn6_sock); + if (family == AF_INET6 && sock6 && atomic_read(&sock6->refcnt) == 1) return false; #endif @@ -961,10 +964,12 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) if (!netif_running(vxlan->dev) || vxlan == dev) continue; - if (family == AF_INET && vxlan->vn4_sock != dev->vn4_sock) + if (family == AF_INET && + rtnl_dereference(vxlan->vn4_sock) != sock4) continue; #if IS_ENABLED(CONFIG_IPV6) - if (family == AF_INET6 && vxlan->vn6_sock != dev->vn6_sock) + if (family == AF_INET6 && + rtnl_dereference(vxlan->vn6_sock) != sock6) continue; #endif @@ -1005,22 +1010,25 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) static void vxlan_sock_release(struct vxlan_dev *vxlan) { - bool ipv4 = __vxlan_sock_release_prep(vxlan->vn4_sock); + struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); #if IS_ENABLED(CONFIG_IPV6) - bool ipv6 = __vxlan_sock_release_prep(vxlan->vn6_sock); + struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); + + rcu_assign_pointer(vxlan->vn6_sock, NULL); #endif + rcu_assign_pointer(vxlan->vn4_sock, NULL); synchronize_net(); - if (ipv4) { - udp_tunnel_sock_release(vxlan->vn4_sock->sock); - kfree(vxlan->vn4_sock); + if (__vxlan_sock_release_prep(sock4)) { + udp_tunnel_sock_release(sock4->sock); + kfree(sock4); } #if IS_ENABLED(CONFIG_IPV6) - if (ipv6) { - udp_tunnel_sock_release(vxlan->vn6_sock->sock); - kfree(vxlan->vn6_sock); + if (__vxlan_sock_release_prep(sock6)) { + udp_tunnel_sock_release(sock6->sock); + kfree(sock6); } #endif } @@ -1036,18 +1044,21 @@ static int vxlan_igmp_join(struct vxlan_dev *vxlan) int ret = -EINVAL; if (ip->sa.sa_family == AF_INET) { + struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; - sk = vxlan->vn4_sock->sock->sk; + sk = sock4->sock->sk; lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { - sk = vxlan->vn6_sock->sock->sk; + struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); + + sk = sock6->sock->sk; lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, &ip->sin6.sin6_addr); @@ -1067,18 +1078,21 @@ static int vxlan_igmp_leave(struct vxlan_dev *vxlan) int ret = -EINVAL; if (ip->sa.sa_family == AF_INET) { + struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); struct ip_mreqn mreq = { .imr_multiaddr.s_addr = 
ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; - sk = vxlan->vn4_sock->sock->sk; + sk = sock4->sock->sk; lock_sock(sk); ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { - sk = vxlan->vn6_sock->sock->sk; + struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); + + sk = sock6->sock->sk; lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, &ip->sin6.sin6_addr); @@ -1828,11 +1842,15 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct dst_cache *dst_cache, const struct ip_tunnel_info *info) { + struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct dst_entry *ndst; struct flowi6 fl6; int err; + if (!sock6) + return ERR_PTR(-EIO); + if (tos && !info) use_cache = false; if (use_cache) { @@ -1850,7 +1868,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, fl6.flowi6_proto = IPPROTO_UDP; err = ipv6_stub->ipv6_dst_lookup(vxlan->net, - vxlan->vn6_sock->sock->sk, + sock6->sock->sk, &ndst, &fl6); if (err < 0) return ERR_PTR(err); @@ -1995,9 +2013,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } if (dst->sa.sa_family == AF_INET) { - if (!vxlan->vn4_sock) + struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); + + if (!sock4) goto drop; - sk = vxlan->vn4_sock->sock->sk; + sk = sock4->sock->sk; rt = vxlan_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, @@ -2050,12 +2070,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) } else { + struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; u32 rt6i_flags; - if (!vxlan->vn6_sock) + if (!sock6) goto drop; - sk = vxlan->vn6_sock->sock->sk; + sk = sock6->sock->sk; ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, @@ -2415,9 +2436,10 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) dport = info->key.tp_dst ? 
: vxlan->cfg.dst_port; if (ip_tunnel_info_af(info) == AF_INET) { + struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; - if (!vxlan->vn4_sock) + if (!sock4) return -EINVAL; rt = vxlan_get_route(vxlan, skb, 0, info->key.tos, info->key.u.ipv4.dst, @@ -2429,8 +2451,6 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) #if IS_ENABLED(CONFIG_IPV6) struct dst_entry *ndst; - if (!vxlan->vn6_sock) - return -EINVAL; ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos, info->key.label, &info->key.u.ipv6.dst, &info->key.u.ipv6.src, NULL, info); @@ -2740,10 +2760,10 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) return PTR_ERR(vs); #if IS_ENABLED(CONFIG_IPV6) if (ipv6) - vxlan->vn6_sock = vs; + rcu_assign_pointer(vxlan->vn6_sock, vs); else #endif - vxlan->vn4_sock = vs; + rcu_assign_pointer(vxlan->vn4_sock, vs); vxlan_vs_add_dev(vs, vxlan); return 0; } @@ -2754,9 +2774,9 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan) bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA; int ret = 0; - vxlan->vn4_sock = NULL; + RCU_INIT_POINTER(vxlan->vn4_sock, NULL); #if IS_ENABLED(CONFIG_IPV6) - vxlan->vn6_sock = NULL; + RCU_INIT_POINTER(vxlan->vn6_sock, NULL); if (ipv6 || metadata) ret = __vxlan_sock_add(vxlan, true); #endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0255613a54a4..308adc4154f4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -225,9 +225,9 @@ struct vxlan_config { struct vxlan_dev { struct hlist_node hlist; /* vni hash table */ struct list_head next; /* vxlan's per namespace list */ - struct vxlan_sock *vn4_sock; /* listening socket for IPv4 */ + struct vxlan_sock __rcu *vn4_sock; /* listening socket for IPv4 */ #if IS_ENABLED(CONFIG_IPV6) - struct vxlan_sock *vn6_sock; /* listening socket for IPv6 */ + struct vxlan_sock __rcu *vn6_sock; /* listening socket for IPv6 */ #endif struct net_device *dev; struct net *net; /* netns for packet i/o */ -- cgit v1.2.3
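The vxlan fix above relies on the standard RCU publish/retire pattern for a pointer that the control path replaces under RTNL while the transmit path reads it locklessly. Below is a minimal, self-contained sketch of that pattern; the names (struct my_dev, struct my_sock, my_dev_xmit, my_dev_release_sock) are hypothetical and chosen only for illustration, this is not the vxlan code itself.

/*
 * Sketch of RCU-protected pointer handling, assuming a control path that
 * holds RTNL and a data path that runs inside rcu_read_lock().
 */
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

struct my_sock {
	int dummy;			/* resources used by the data path */
};

struct my_dev {
	struct my_sock __rcu *sock;	/* written under RTNL, read under RCU */
};

/* Control path (RTNL held): publish a new socket. */
static void my_dev_set_sock(struct my_dev *dev, struct my_sock *sk)
{
	rcu_assign_pointer(dev->sock, sk);
}

/* Control path (RTNL held): retire the socket safely. */
static void my_dev_release_sock(struct my_dev *dev)
{
	struct my_sock *sk = rtnl_dereference(dev->sock);

	RCU_INIT_POINTER(dev->sock, NULL);
	synchronize_net();		/* wait for in-flight readers */
	kfree(sk);
}

/* Data path (caller holds rcu_read_lock(), e.g. ndo_start_xmit). */
static int my_dev_xmit(struct my_dev *dev)
{
	struct my_sock *sk = rcu_dereference(dev->sock);

	if (!sk)			/* socket already retired: drop */
		return -ENODEV;
	/* ... transmit using sk ... */
	return 0;
}

The ordering is the important part: the pointer is cleared first and the memory freed only after synchronize_net() returns, so any transmit already running with the old pointer finishes before the socket goes away. That matches what vxlan_sock_release() does with vn4_sock/vn6_sock in the patch above, and why vxlan_xmit_one()/vxlan_fill_metadata_dst() now check the rcu_dereference()d socket for NULL instead of assuming it exists.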