diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-10-17 00:39:20 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-10-17 00:39:20 +0200 |
commit | 52ad096465d60ee7fdc99583f969a99a1166c7c3 (patch) | |
tree | 0cf33926ce8b5eb4f6675d3b1332b35e914f4846 | |
parent | Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/ke... (diff) | |
parent | Merge branch 'next' (diff) | |
download | linux-52ad096465d60ee7fdc99583f969a99a1166c7c3.tar.xz linux-52ad096465d60ee7fdc99583f969a99a1166c7c3.zip |
Merge git://git.linux-nfs.org/projects/trondmy/nfs-2.6
* git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (53 commits)
NFS: Fix a resolution problem with nfs_inode->cache_change_attribute
NFS: Fix the resolution problem with nfs_inode_attrs_need_update()
NFS: Changes to inode->i_nlinks must set the NFS_INO_INVALID_ATTR flag
RPC/RDMA: ensure connection attempt is complete before signalling.
RPC/RDMA: correct the reconnect timer backoff
RPC/RDMA: optionally emit useful transport info upon connect/disconnect.
RPC/RDMA: reformat a debug printk to keep lines together.
RPC/RDMA: harden connection logic against missing/late rdma_cm upcalls.
RPC/RDMA: fix connect/reconnect resource leak.
RPC/RDMA: return a consistent error, when connect fails.
RPC/RDMA: adhere to protocol for unpadded client trailing write chunks.
RPC/RDMA: avoid an oops due to disconnect racing with async upcalls.
RPC/RDMA: maintain the RPC task bytes-sent statistic.
RPC/RDMA: suppress retransmit on RPC/RDMA clients.
RPC/RDMA: fix connection IRD/ORD setting
RPC/RDMA: support FRMR client memory registration.
RPC/RDMA: check selected memory registration mode at runtime.
RPC/RDMA: add data types and new FRMR memory registration enum.
RPC/RDMA: refactor the inline memory registration code.
NFS: fix nfs_parse_ip_address() corner case
...
-rw-r--r-- | fs/nfs/client.c | 5 | ||||
-rw-r--r-- | fs/nfs/dir.c | 20 | ||||
-rw-r--r-- | fs/nfs/file.c | 18 | ||||
-rw-r--r-- | fs/nfs/inode.c | 183 | ||||
-rw-r--r-- | fs/nfs/internal.h | 25 | ||||
-rw-r--r-- | fs/nfs/mount_clnt.c | 3 | ||||
-rw-r--r-- | fs/nfs/namespace.c | 7 | ||||
-rw-r--r-- | fs/nfs/nfs3acl.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs3proc.c | 20 | ||||
-rw-r--r-- | fs/nfs/nfs4namespace.c | 105 | ||||
-rw-r--r-- | fs/nfs/proc.c | 10 | ||||
-rw-r--r-- | fs/nfs/super.c | 126 | ||||
-rw-r--r-- | fs/nfs/unlink.c | 5 | ||||
-rw-r--r-- | fs/nfs/write.c | 3 | ||||
-rw-r--r-- | include/linux/nfs_fs.h | 19 | ||||
-rw-r--r-- | include/linux/nfs_fs_sb.h | 1 | ||||
-rw-r--r-- | include/linux/nfs_mount.h | 4 | ||||
-rw-r--r-- | include/linux/nfs_xdr.h | 11 | ||||
-rw-r--r-- | include/linux/sunrpc/xprtrdma.h | 4 | ||||
-rw-r--r-- | net/sunrpc/clnt.c | 4 | ||||
-rw-r--r-- | net/sunrpc/rpcb_clnt.c | 40 | ||||
-rw-r--r-- | net/sunrpc/xprt.c | 12 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 29 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 41 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 741 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 17 |
26 files changed, 955 insertions, 500 deletions
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7058b3..7547600b6174 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server, server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); @@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server, goto error; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; if (data->rsize) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 74f92b717f78..2ab70d46ecbc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -156,6 +156,7 @@ typedef struct { decode_dirent_t decode; int plus; unsigned long timestamp; + unsigned long gencount; int timestamp_valid; } nfs_readdir_descriptor_t; @@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct file *file = desc->file; struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; + unsigned long timestamp, gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. @@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + unsigned long timestamp, gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; @@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } /* @@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, /* Don't revalidate a negative dentry if we're creating a new file */ if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78460657f5cb..d319b49f8f06 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; - } - lock_kernel(); /* BKL needed? */ - loff = generic_file_llseek_unlocked(filp, offset, origin); - unlock_kernel(); + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); return loff; } @@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. - */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52daefa2f521..b9195c02a863 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; - nfsi->cache_change_attribute = now; + nfsi->attr_gencount = fattr->gencount; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -453,6 +452,7 @@ out_big: void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } @@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } } -static int nfs_wait_schedule(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -/* - * Wait for the inode to get unlocked. - */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; -} - -static void nfs_wake_up_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); -} - int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; - - status = nfs_wait_on_inode(inode); - if (status < 0) goto out; - status = -ESTALE; if (NFS_STALE(inode)) goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, &fattr); if (status) { - spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); @@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: - nfs_wake_up_inode(inode); - - out_nowait: return status; } @@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; } - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (invalid != 0) nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); nfsi->read_cache_jiffies = fattr->time_start; return 0; } +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static unsigned long nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + smp_rmb(); + return nfs_attr_generation_counter; +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + unsigned long ret; + smp_rmb(); + ret = ++nfs_attr_generation_counter; + smp_wmb(); + return ret; +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + return status; } /** @@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + goto out_noforce; + } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; @@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_post_op_update_inode(inode, fattr); +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } /* @@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; @@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 24241fcbb98d..d212ee41caf2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); /* super.c */ +void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ extern char *nfs_path(const char *base, @@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +#define IPV6_SCOPE_DELIMITER '%' + +/* + * Set the port number in an address. Be agnostic about the address + * family. + */ +static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) +{ + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_INET: + ap->sin_port = htons(port); + break; + case AF_INET6: + ap6->sin6_port = htons(port); + break; + } +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 779d2eb649c5..086a6830d785 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> +#include "internal.h" #ifdef RPC_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT @@ -98,7 +99,7 @@ out_call_err: out_mnt_err: dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + status = nfs_stat_to_errno(result.status); goto out; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 66df08dd1caf..64a288ee046d 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); - BUG_ON(IS_ROOT(dentry)); + err = -ESTALE; + if (IS_ROOT(dentry)) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; + struct vfsmount *mnt = ERR_PTR(-EINVAL); switch (server->nfs_client->rpc_ops->version) { case 2: case 3: diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 423842f51ac9..cef62557c87d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1e750e4574a9..c55be7a7679e 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b112857301f7..30befc39b3c6 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) { - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + int page2len; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return mnt; + mountdata->mnt_path = mnt_path; + page2 += strlen(mnt_path) + 1; + page2len = PAGE_SIZE - strlen(mnt_path) - 1; + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= PAGE_SIZE) + continue; + + mountdata->addr = (struct sockaddr *)&addr; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + nfs_parse_ip_address(buf->data, buf->len, + mountdata->addr, &mountdata->addrlen); + if (mountdata->addr->sa_family == AF_UNSPEC) + continue; + nfs_set_port(mountdata->addr, NFS_PORT); + + strncpy(page2, buf->data, page2len); + page2[page2len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; } - return 0; + return mnt; } /** @@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) @@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; + location->rootpath.ncomponents == 0) continue; - } - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; - continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } - loc++; + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; } out: diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4dbb84df1b68..193465210d7c 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ffb697416cb1..8b28b95c9e44 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -91,6 +91,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_err, NULL } }; @@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { @@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } /* @@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb) } /* - * Set the port number in an address. Be agnostic about the address family. - */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } - } -} - -/* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be @@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } -#define IPV6_SCOPE_DELIMITER '%' - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) +static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) { char *p; size_t len; - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return ; + if ((string + str_len) == delim) + return 1; + if (*delim != IPV6_SCOPE_DELIMITER) - return; + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; len = (string + str_len) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); @@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, scope_id = dev->ifindex; dev_put(dev); } else { - /* scope_id is set to zero on error */ - strict_strtoul(p, 10, &scope_id); + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } } kfree(p); + sin6->sin6_scope_id = scope_id; dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + return 1; } + + return 0; } static void nfs_parse_ipv6_address(char *string, size_t str_len, @@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { - nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); - return; + if (in6_pton(string, str_len, addr, + IPV6_SCOPE_DELIMITER, &delim) != 0) { + if (nfs_parse_ipv6_scope_id(string, str_len, + delim, sin6) != 0) + return; } } @@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, * If there is a problem constructing the new sockaddr, set the address * family to AF_UNSPEC. */ -static void nfs_parse_ip_address(char *string, size_t str_len, +void nfs_parse_ip_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { unsigned int i, colons; @@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw, &mnt->mount_server.addrlen); kfree(string); break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + errors++; + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + }; + break; /* * Special options @@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options, * Translate to nfs_parsed_mount_data, which nfs_fill_super * can deal with. */ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index f089e5839d7d..ecc295347775 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(NFS_SB(sb)); + nfs_sb_deactive(sb); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(&data->res.dir_attr); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e217c773..9f9845859fc1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .for_writepages = 1, - .range_cyclic = 1, }; int ret; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 78a5922a2f11..ac8d0233b05c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -137,7 +137,7 @@ struct nfs_inode { unsigned long attrtimeo_timestamp; __u64 change_attr; /* v4 only */ - unsigned long last_updated; + unsigned long attr_gencount; /* "Generation counter" for the attribute cache. This is * bumped whenever we update the metadata on the * server. @@ -200,11 +200,10 @@ struct nfs_inode { /* * Bit offsets in flags field */ -#define NFS_INO_REVALIDATING (0) /* revalidating attrs */ -#define NFS_INO_ADVISE_RDPLUS (1) /* advise readdirplus */ -#define NFS_INO_STALE (2) /* possible stale inode */ -#define NFS_INO_ACL_LRU_SET (3) /* Inode is on the LRU list */ -#define NFS_INO_MOUNTPOINT (4) /* inode is remote mountpoint */ +#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ +#define NFS_INO_STALE (1) /* possible stale inode */ +#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ +#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { @@ -345,15 +344,11 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ct extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); extern u64 nfs_compat_user_ino64(u64 fileid); +extern void nfs_fattr_init(struct nfs_fattr *fattr); /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ - -static inline void nfs_fattr_init(struct nfs_fattr *fattr) -{ - fattr->valid = 0; - fattr->time_start = jiffies; -} +extern unsigned long nfs_inc_attr_generation_counter(void); /* * linux/fs/nfs/file.c diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c9beacd16c00..4e477ae58699 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -119,7 +119,6 @@ struct nfs_server { void (*destroy)(struct nfs_server *); atomic_t active; /* Keep trace of any activity to this server */ - wait_queue_head_t active_wq; /* Wait for any activity to stop */ /* mountd-related mount options */ struct sockaddr_storage mountd_address; diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index df7c6b7a7ebb..6549a06ac16e 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -65,4 +65,8 @@ struct nfs_mount_data { #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ #define NFS_MOUNT_FLAGMASK 0xFFFF +/* The following are for internal use only */ +#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 +#define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 + #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8c77c11224d1..c1c31acb8a2b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -36,6 +36,7 @@ struct nfs_fattr { __u32 nlink; __u32 uid; __u32 gid; + dev_t rdev; __u64 size; union { struct { @@ -46,7 +47,6 @@ struct nfs_fattr { __u64 used; } nfs3; } du; - dev_t rdev; struct nfs_fsid fsid; __u64 fileid; struct timespec atime; @@ -56,6 +56,7 @@ struct nfs_fattr { __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ unsigned long time_start; + unsigned long gencount; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ @@ -672,16 +673,16 @@ struct nfs4_rename_res { struct nfs_fattr * new_fattr; }; -#define NFS4_SETCLIENTID_NAMELEN (56) +#define NFS4_SETCLIENTID_NAMELEN (127) struct nfs4_setclientid { const nfs4_verifier * sc_verifier; unsigned int sc_name_len; - char sc_name[NFS4_SETCLIENTID_NAMELEN]; + char sc_name[NFS4_SETCLIENTID_NAMELEN + 1]; u32 sc_prog; unsigned int sc_netid_len; - char sc_netid[RPCBIND_MAXNETIDLEN]; + char sc_netid[RPCBIND_MAXNETIDLEN + 1]; unsigned int sc_uaddr_len; - char sc_uaddr[RPCBIND_MAXUADDRLEN]; + char sc_uaddr[RPCBIND_MAXUADDRLEN + 1]; u32 sc_cb_ident; }; diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 4de56b1d372b..54a379c9e8eb 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -66,9 +66,6 @@ #define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ -#define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */ -#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ - /* memory registration strategies */ #define RPCRDMA_PERSISTENT_REGISTRATION (1) @@ -78,6 +75,7 @@ enum rpcrdma_memreg { RPCRDMA_MEMWINDOWS, RPCRDMA_MEMWINDOWS_ASYNC, RPCRDMA_MTHCAFMR, + RPCRDMA_FRMR, RPCRDMA_ALLPHYSICAL, RPCRDMA_LAST }; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index da0789fa1b88..4895c341e46d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(utsname()->nodename); + clnt->cl_nodelen = strlen(init_utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); rpc_register_client(clnt); return clnt; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 34abc91058d8..41013dd66ac3 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -460,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi return rpc_run_task(&task_setup_data); } +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request @@ -469,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi */ void rpcb_getport_async(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_clnt *clnt; struct rpc_procinfo *proc; u32 bind_version; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; static struct rpcbind_args *map; struct rpc_task *child; @@ -481,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task) size_t salen; int status; + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on(&xprt->binding, task, NULL); @@ -549,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task) status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", task->tk_pid, __func__); - goto bailout_nofree; + goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; @@ -569,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__); return; } - rpc_put_task(child); - task->tk_xprt->stat.bind_count++; + xprt->stat.bind_count++; + rpc_put_task(child); return; +bailout_release_client: + rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aabe332..29e401bb612e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport) goto out; } - result = -EINVAL; - if (try_module_get(THIS_MODULE)) { - list_add_tail(&transport->list, &xprt_list); - printk(KERN_INFO "RPC: Registered %s transport module.\n", - transport->name); - result = 0; - } + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; out: spin_unlock(&xprt_list_lock); @@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport) "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); - module_put(THIS_MODULE); goto out; } } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 5c1954d28d09..14106d26bb95 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; if (n == nsegs) return 0; seg[n].mr_page = NULL; @@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen == 0) return -1; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" - " headerp 0x%p base 0x%p lkey 0x%x\n", + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); @@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b * Scatter inline received data back into provided iov's. */ static void -rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; @@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) } else rqst->rq_rcv_buf.tail[0].iov_len = 0; + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + if (copy_len) dprintk("RPC: %s: %d bytes in" " %d extra segments (%d lost)\n", @@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, ep->rep_connected); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } spin_unlock_bh(&xprt->transport_lock); } @@ -792,14 +805,20 @@ repost: ((unsigned char *)iptr - (unsigned char *)headerp); status = rep->rr_len + rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } } else { /* else ordinary inline */ + rdmalen = 0; iptr = (__be32 *)((unsigned char *)headerp + 28); rep->rr_len -= 28; /*sizeof *headerp;*/ status = rep->rr_len; } /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; case htonl(RDMA_NOMSG): diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a564c1a39ec5..9839c3d94145 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -#if !RPCRDMA_PERSISTENT_REGISTRATION -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ -#else -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; -#endif +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; #ifdef RPC_DEBUG @@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_memreg, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0, }, }; @@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } @@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task) /* Reconnect */ schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); out: + req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; outfail: @@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task) req->rl_reply->rr_xprt = xprt; } - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ - } + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + task->tk_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ } static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8ea283ecc522..a5fef5e6c323 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -338,13 +339,32 @@ connected: wake_up_all(&ep->rep_connect_wait); break; default: - ia->ri_async_rc = -EINVAL; - dprintk("RPC: %s: unexpected CM event %X\n", + dprintk("RPC: %s: unexpected CM event %d\n", __func__, event->event); - complete(&ia->ri_done); break; } +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "on %s, memreg %d slots %d ird %d%s\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "closed (%d)\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + connstate); + } +#endif + return 0; } @@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rdma_cm_id *id; int rc; + init_completion(&ia->ri_done); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, return id; } - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; @@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc; + int rc, mem_priv; + struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; - init_completion(&ia->ri_done); - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + + /* * Optionally obtain an underlying physical identity mapping in * order to do a memory window-based bind. This base registration * is protected from remote access - that is enabled only by binding @@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * revoked after the corresponding completion similar to a storage * adapter. */ - if (memreg > RPCRDMA_REGISTER) { - int mem_priv = IB_ACCESS_LOCAL_WRITE; - switch (memreg) { + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; #if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: - mem_priv |= IB_ACCESS_REMOTE_WRITE; - mem_priv |= IB_ACCESS_REMOTE_READ; - break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv |= IB_ACCESS_MW_BIND; - break; - default: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) break; - } + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) memreg = RPCRDMA_REGISTER; ia->ri_bind_mem = NULL; } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) return 0; out2: rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; out1: return rc; } @@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { rc = ib_dealloc_pd(ia->ri_pd); dprintk("RPC: %s: ib_dealloc_pd returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); } /* @@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs */ + ep->rep_attr.cap.max_send_wr *= 3; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ - switch (ia->ri_memreg_strategy) { - case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) ep->rep_remote_cma.responder_resources = 0; - break; - case RPCRDMA_MTHCAFMR: - case RPCRDMA_REGISTER: - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 8); - break; - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: -#if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: -#endif - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 2); - break; - default: - break; - } - if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; - ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; } - ep->rep_func = NULL; - /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } - if (ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) @@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc = 0; int retry_count = 0; - int reconnect = (ep->rep_connected != 0); - if (reconnect) { + if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: rc = rpcrdma_ep_disconnect(ep, ia); @@ -745,6 +836,7 @@ retry: goto out; } /* END TEMP */ + rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; } @@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { } } - /* Theoretically a client initiator_depth > 0 is not needed, - * but many peers fail to complete the connection unless they - * == responder_resources! */ - if (ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources) - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { goto out; } - if (reconnect) - return 0; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* @@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - if (ep->rep_remote_cma.initiator_depth == 0) - ++ep->rep_remote_cma.initiator_depth; - if (ep->rep_remote_cma.responder_resources == 0) - ++ep->rep_remote_cma.responder_resources; - if (retry_count++ == 0) + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; goto retry; + } rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); @@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len; int i, rc; + struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, if any + * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * @@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; case RPCRDMA_MTHCAFMR: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; - struct ib_fmr_attr fa = { - RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT - }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); @@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); @@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; default: break; @@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) @@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { - struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) @@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; unsigned long flags; + int i; + struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; if (!list_empty(&buffers->rb_mws)) { - int i = RPCRDMA_MAX_SEGS - 1; + i = RPCRDMA_MAX_SEGS - 1; do { - struct rpcrdma_mw *r; r = list_entry(buffers->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: @@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, va, len, DMA_BIDIRECTIONAL); iov->length = len; - if (ia->ri_bind_mem != NULL) { + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { *mrp = NULL; iov->lkey = ia->ri_bind_mem->lkey; return 0; @@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr frmr_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - int i; int rc = 0; switch (ia->ri_memreg_strategy) { @@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif - /* Registration using fast memory registration */ + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: - { - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff = offset_in_page(seg->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, nsegs, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; /* Registration using memory windows */ case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; - param.send_flags = 0; - param.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; - seg->mr_nsegs = 1; - nsegs = 1; - } - } + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); break; /* Default registration each time */ default: - { - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len = 0; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, nsegs, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); break; } if (rc) @@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt, void *r) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; int nsegs = seg->mr_nsegs, rc; switch (ia->ri_memreg_strategy) { @@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; #endif + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + case RPCRDMA_MTHCAFMR: - { - LIST_HEAD(l); - list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - } - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_fmr_external(seg, ia); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - BUG_ON(nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; - if (r) { - param.wr_id = (u64) (unsigned long) r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - } - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - r = NULL; /* will upcall on completion */ + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); break; default: - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2427822f8bd4..c7a7eba991bc 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,9 @@ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + /* * Interface Adapter -- one per transport instance */ @@ -58,6 +61,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; @@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ union { struct ib_mw *mw; struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + } frmr; } r; struct list_head mw_list; } *rl_mw; @@ -175,6 +184,7 @@ struct rpcrdma_req { size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -198,7 +208,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -273,6 +283,11 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + /* * Interface Adapter calls - xprtrdma/verbs.c */ |